From c19cb4672d880d39ee2c60e7de18fbc01558072f Mon Sep 17 00:00:00 2001 From: MoneroOcean Date: Thu, 15 Aug 2019 17:23:22 -0700 Subject: [PATCH] Added experimental DefyX support --- CMakeLists.txt | 23 + src/backend/cpu/CpuWorker.cpp | 7 +- src/core/Benchmark.cpp | 1 + src/core/Benchmark.h | 2 + src/core/config/usage.h | 2 +- src/crypto/cn/CnAlgo.h | 3 + src/crypto/common/Algorithm.cpp | 8 + src/crypto/common/Algorithm.h | 1 + src/crypto/defyx/KangarooTwelve.c | 271 ++++ src/crypto/defyx/KangarooTwelve.h | 89 ++ src/crypto/defyx/KeccakP-1600-SnP.h | 41 + src/crypto/defyx/KeccakP-1600-reference.c | 402 ++++++ src/crypto/defyx/KeccakSponge-common.h | 35 + src/crypto/defyx/KeccakSponge.inc | 311 +++++ src/crypto/defyx/KeccakSpongeWidth1600.c | 54 + src/crypto/defyx/KeccakSpongeWidth1600.h | 31 + src/crypto/defyx/Phases.h | 22 + src/crypto/defyx/align.h | 32 + src/crypto/defyx/brg_endian.h | 143 +++ src/crypto/defyx/defyx.cpp | 109 ++ src/crypto/defyx/defyx.h | 57 + src/crypto/defyx/sha256.c | 411 +++++++ src/crypto/defyx/sha256.h | 62 + src/crypto/defyx/sysendian.h | 138 +++ src/crypto/defyx/yescrypt-best.c | 5 + src/crypto/defyx/yescrypt-opt.c | 1102 +++++++++++++++++ src/crypto/defyx/yescrypt-platform.c | 188 +++ src/crypto/defyx/yescrypt-simd.c | 1367 +++++++++++++++++++++ src/crypto/defyx/yescrypt.h | 326 +++++ src/crypto/rx/RxAlgo.cpp | 8 + 30 files changed, 5249 insertions(+), 2 deletions(-) create mode 100644 src/crypto/defyx/KangarooTwelve.c create mode 100644 src/crypto/defyx/KangarooTwelve.h create mode 100644 src/crypto/defyx/KeccakP-1600-SnP.h create mode 100644 src/crypto/defyx/KeccakP-1600-reference.c create mode 100644 src/crypto/defyx/KeccakSponge-common.h create mode 100644 src/crypto/defyx/KeccakSponge.inc create mode 100644 src/crypto/defyx/KeccakSpongeWidth1600.c create mode 100644 src/crypto/defyx/KeccakSpongeWidth1600.h create mode 100644 src/crypto/defyx/Phases.h create mode 100644 src/crypto/defyx/align.h create mode 100644 
src/crypto/defyx/brg_endian.h create mode 100644 src/crypto/defyx/defyx.cpp create mode 100644 src/crypto/defyx/defyx.h create mode 100644 src/crypto/defyx/sha256.c create mode 100644 src/crypto/defyx/sha256.h create mode 100644 src/crypto/defyx/sysendian.h create mode 100644 src/crypto/defyx/yescrypt-best.c create mode 100644 src/crypto/defyx/yescrypt-opt.c create mode 100644 src/crypto/defyx/yescrypt-platform.c create mode 100644 src/crypto/defyx/yescrypt-simd.c create mode 100644 src/crypto/defyx/yescrypt.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 09a28602..b312bfc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,6 +154,7 @@ find_package(UV REQUIRED) if (WITH_RANDOMX) include_directories(src/crypto/randomx) + include_directories(src/crypto/defyx) add_definitions(/DXMRIG_ALGO_RANDOMX) set(SOURCES_CRYPTO "${SOURCES_CRYPTO}" @@ -188,6 +189,28 @@ if (WITH_RANDOMX) src/crypto/rx/RxDataset.h src/crypto/rx/RxVm.cpp src/crypto/rx/RxVm.h + + src/crypto/defyx/align.h + src/crypto/defyx/brg_endian.h + src/crypto/defyx/defyx.cpp + src/crypto/defyx/defyx.h + src/crypto/defyx/KangarooTwelve.c + src/crypto/defyx/KangarooTwelve.h + src/crypto/defyx/KeccakP-1600-reference.c + src/crypto/defyx/KeccakP-1600-SnP.h + src/crypto/defyx/KeccakSponge-common.h + src/crypto/defyx/KeccakSponge.inc + src/crypto/defyx/KeccakSpongeWidth1600.c + src/crypto/defyx/KeccakSpongeWidth1600.h + src/crypto/defyx/Phases.h + src/crypto/defyx/sha256.c + src/crypto/defyx/sha256.h + src/crypto/defyx/sysendian.h + src/crypto/defyx/yescrypt-best.c + src/crypto/defyx/yescrypt-opt.c + src/crypto/defyx/yescrypt-platform.c + src/crypto/defyx/yescrypt-simd.c + src/crypto/defyx/yescrypt.h ) if (NOT ARCH_ID) set(ARCH_ID ${CMAKE_HOST_SYSTEM_PROCESSOR}) diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 14ef1797..6035025e 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -41,6 +41,7 @@ #ifdef XMRIG_ALGO_RANDOMX # include 
"crypto/randomx/randomx.h" +# include "crypto/defyx/defyx.h" #endif @@ -190,7 +191,11 @@ void xmrig::CpuWorker::start() # ifdef XMRIG_ALGO_RANDOMX if (job.algorithm().family() == Algorithm::RANDOM_X) { - randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + if (job.algorithm() == Algorithm::DEFYX) { + defyx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + } else { + randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + } } else # endif diff --git a/src/core/Benchmark.cpp b/src/core/Benchmark.cpp index 3840bddc..6d754f84 100644 --- a/src/core/Benchmark.cpp +++ b/src/core/Benchmark.cpp @@ -126,6 +126,7 @@ float Benchmark::get_algo_perf(Algorithm::Id algo) const { case Algorithm::RX_0: return m_bench_algo_perf[BenchAlgo::RX_0]; case Algorithm::RX_WOW: return m_bench_algo_perf[BenchAlgo::RX_WOW]; case Algorithm::RX_LOKI: return m_bench_algo_perf[BenchAlgo::RX_0]; + case Algorithm::DEFYX: return m_bench_algo_perf[BenchAlgo::DEFYX]; default: return 0.0f; } } diff --git a/src/core/Benchmark.h b/src/core/Benchmark.h index 66949878..443904e8 100644 --- a/src/core/Benchmark.h +++ b/src/core/Benchmark.h @@ -32,6 +32,7 @@ class Benchmark : public IJobResultListener { enum BenchAlgo : int { RX_0, // "rx/0" RandomX (reference configuration). RX_WOW, // "rx/wow" RandomWOW (Wownero). + DEFYX, // "defyx" DefyX. CN_R, // "cn/r" CryptoNightR (Monero's variant 4). CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo). CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1. 
@@ -45,6 +46,7 @@ class Benchmark : public IJobResultListener { const Algorithm::Id ba2a[BenchAlgo::MAX] = { Algorithm::RX_0, Algorithm::RX_WOW, + Algorithm::DEFYX, Algorithm::CN_R, Algorithm::CN_GPU, Algorithm::CN_LITE_1, diff --git a/src/core/config/usage.h b/src/core/config/usage.h index b41ec6db..f1acd7c1 100644 --- a/src/core/config/usage.h +++ b/src/core/config/usage.h @@ -57,7 +57,7 @@ Options:\n\ #endif #ifdef XMRIG_ALGO_RANDOMX "\ - rx/wow, rx/loki\n" + rx/wow, rx/loki, defyx\n" #endif "\ -o, --url=URL URL of mining server\n\ diff --git a/src/crypto/cn/CnAlgo.h b/src/crypto/cn/CnAlgo.h index ed64331a..3cd88a5a 100644 --- a/src/crypto/cn/CnAlgo.h +++ b/src/crypto/cn/CnAlgo.h @@ -132,6 +132,7 @@ private: 0, // RX_0 0, // RX_WOW 0, // RX_LOKI + 0, // DEFYX # endif }; @@ -167,6 +168,7 @@ private: 0, // RX_0 0, // RX_WOW 0, // RX_LOKI + 0, // DEFYX # endif }; @@ -202,6 +204,7 @@ private: Algorithm::INVALID, // RX_0 Algorithm::INVALID, // RX_WOW Algorithm::INVALID, // RX_LOKI + Algorithm::INVALID, // DEFYX # endif }; }; diff --git a/src/crypto/common/Algorithm.cpp b/src/crypto/common/Algorithm.cpp index db6cb234..2d12ea6d 100644 --- a/src/crypto/common/Algorithm.cpp +++ b/src/crypto/common/Algorithm.cpp @@ -115,6 +115,7 @@ static AlgoName const algorithm_names[] = { { "RandomWOW", nullptr, Algorithm::RX_WOW }, { "randomx/loki", "rx/loki", Algorithm::RX_LOKI }, { "RandomXL", nullptr, Algorithm::RX_LOKI }, + { "DefyX", "defyx", Algorithm::DEFYX }, # endif }; @@ -159,6 +160,9 @@ size_t xmrig::Algorithm::l2() const case RX_WOW: return 0x20000; + case DEFYX: + return 0x20000; + default: break; } @@ -189,6 +193,9 @@ size_t xmrig::Algorithm::l3() const case RX_WOW: return oneMiB; + case DEFYX: + return 0x40000; + default: break; } @@ -241,6 +248,7 @@ xmrig::Algorithm::Family xmrig::Algorithm::family(Id id) case RX_0: case RX_WOW: case RX_LOKI: + case DEFYX: return RANDOM_X; # endif diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h index 
92fcc61e..8c708f9a 100644 --- a/src/crypto/common/Algorithm.h +++ b/src/crypto/common/Algorithm.h @@ -72,6 +72,7 @@ public: RX_0, // "rx/0" RandomX (reference configuration). RX_WOW, // "rx/wow" RandomWOW (Wownero). RX_LOKI, // "rx/loki" RandomXL (Loki). + DEFYX, // "defyx" DefyX (Scala). # endif MAX }; diff --git a/src/crypto/defyx/KangarooTwelve.c b/src/crypto/defyx/KangarooTwelve.c new file mode 100644 index 00000000..5f8b879f --- /dev/null +++ b/src/crypto/defyx/KangarooTwelve.c @@ -0,0 +1,271 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "KangarooTwelve.h" +#ifndef KeccakP1600timesN_excluded + // #include "KeccakP-1600-times2-SnP.h" + // #include "KeccakP-1600-times4-SnP.h" + // #include "KeccakP-1600-times8-SnP.h" +#endif + +#define chunkSize 8192 +#define laneSize 8 +#define suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ + +#define security 128 +#define capacity (2*security) +#define capacityInBytes (capacity/8) +#define capacityInLanes (capacityInBytes/laneSize) +#define rate (1600-capacity) +#define rateInBytes (rate/8) +#define rateInLanes (rateInBytes/laneSize) + +#define ParallelSpongeFastLoop( Parallellism ) \ + while ( inLen >= Parallellism * chunkSize ) { \ + ALIGN(KeccakP1600times##Parallellism##_statesAlignment) unsigned char states[KeccakP1600times##Parallellism##_statesSizeInBytes]; \ + unsigned char intermediate[Parallellism*capacityInBytes]; \ + unsigned int localBlockLen = chunkSize; \ + const unsigned char * localInput = input; \ + unsigned int i; \ + unsigned int fastLoopOffset; \ + \ + KeccakP1600times##Parallellism##_StaticInitialize(); \ + 
KeccakP1600times##Parallellism##_InitializeAll(states); \ + fastLoopOffset = KeccakP1600times##Parallellism##_12rounds_FastLoop_Absorb(states, rateInLanes, chunkSize / laneSize, rateInLanes, localInput, Parallellism * chunkSize); \ + localBlockLen -= fastLoopOffset; \ + localInput += fastLoopOffset; \ + for ( i = 0; i < Parallellism; ++i, localInput += chunkSize ) { \ + KeccakP1600times##Parallellism##_AddBytes(states, i, localInput, 0, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, suffixLeaf, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, 0x80, rateInBytes-1); \ + } \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + input += Parallellism * chunkSize; \ + inLen -= Parallellism * chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + KeccakP1600times##Parallellism##_ExtractLanesAll(states, intermediate, capacityInLanes, capacityInLanes ); \ + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, Parallellism * capacityInBytes) != 0) return 1; \ + } + +#define ParallelSpongeLoop( Parallellism ) \ + while ( inLen >= Parallellism * chunkSize ) { \ + ALIGN(KeccakP1600times##Parallellism##_statesAlignment) unsigned char states[KeccakP1600times##Parallellism##_statesSizeInBytes]; \ + unsigned char intermediate[Parallellism*capacityInBytes]; \ + unsigned int localBlockLen = chunkSize; \ + const unsigned char * localInput = input; \ + unsigned int i; \ + \ + KeccakP1600times##Parallellism##_StaticInitialize(); \ + KeccakP1600times##Parallellism##_InitializeAll(states); \ + while(localBlockLen >= rateInBytes) { \ + KeccakP1600times##Parallellism##_AddLanesAll(states, localInput, rateInLanes, chunkSize / laneSize); \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + localBlockLen -= rateInBytes; \ + localInput += rateInBytes; \ + } \ + for ( i = 0; i < Parallellism; ++i, localInput += chunkSize ) { \ + KeccakP1600times##Parallellism##_AddBytes(states, i, 
localInput, 0, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, suffixLeaf, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, 0x80, rateInBytes-1); \ + } \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + input += Parallellism * chunkSize; \ + inLen -= Parallellism * chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + KeccakP1600times##Parallellism##_ExtractLanesAll(states, intermediate, capacityInLanes, capacityInLanes ); \ + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, Parallellism * capacityInBytes) != 0) return 1; \ + } + +static unsigned int right_encode( unsigned char * encbuf, size_t value ) +{ + unsigned int n, i; + size_t v; + + for ( v = value, n = 0; v && (n < sizeof(size_t)); ++n, v >>= 8 ) + ; /* empty */ + for ( i = 1; i <= n; ++i ) + encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); + encbuf[n] = (unsigned char)n; + return n + 1; +} + +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputLen) +{ + ktInstance->fixedOutputLength = outputLen; + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 0; + ktInstance->phase = ABSORBING; + return KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->finalNode, rate, capacity); +} + +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inLen) +{ + if (ktInstance->phase != ABSORBING) + return 1; + + if ( ktInstance->blockNumber == 0 ) { + /* First block, absorb in final node */ + unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? 
inLen : (chunkSize - ktInstance->queueAbsorbedLen); + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + ktInstance->queueAbsorbedLen += len; + if ( (ktInstance->queueAbsorbedLen == chunkSize) && (inLen != 0) ) { + /* First block complete and more input data available, finalize it */ + const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, &padding, 1) != 0) + return 1; + ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ + } + } + else if ( ktInstance->queueAbsorbedLen != 0 ) { + /* There is data in the queue, absorb further in queue until block complete */ + unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? inLen : (chunkSize - ktInstance->queueAbsorbedLen); + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + ktInstance->queueAbsorbedLen += len; + if ( ktInstance->queueAbsorbedLen == chunkSize ) { + unsigned char intermediate[capacityInBytes]; + ktInstance->queueAbsorbedLen = 0; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + } + + #if defined(KeccakP1600times8_implementation) && !defined(KeccakP1600times8_isFallback) + #if defined(KeccakP1600times8_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 8 ) + #else + ParallelSpongeLoop( 8 ) + #endif + #endif + + #if defined(KeccakP1600times4_implementation) && 
!defined(KeccakP1600times4_isFallback) + #if defined(KeccakP1600times4_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 4 ) + #else + ParallelSpongeLoop( 4 ) + #endif + #endif + + #if defined(KeccakP1600times2_implementation) && !defined(KeccakP1600times2_isFallback) + #if defined(KeccakP1600times2_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 2 ) + #else + ParallelSpongeLoop( 2 ) + #endif + #endif + + while ( inLen > 0 ) { + unsigned int len = (inLen < chunkSize) ? inLen : chunkSize; + if (KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->queueNode, rate, capacity) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + if ( len == chunkSize ) { + unsigned char intermediate[capacityInBytes]; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + else + ktInstance->queueAbsorbedLen = len; + } + + return 0; +} + +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char * output, const unsigned char * customization, size_t customLen) +{ + unsigned char encbuf[sizeof(size_t)+1+2]; + unsigned char padding; + + if (ktInstance->phase != ABSORBING) + return 1; + + /* Absorb customization | right_encode(customLen) */ + if ((customLen != 0) && (KangarooTwelve_Update(ktInstance, customization, customLen) != 0)) + return 1; + if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customLen)) != 0) + return 1; + + if ( ktInstance->blockNumber == 0 ) { + /* Non complete first block in final node, pad it */ + padding = 0x07; /* '11': message hop, final node */ + } + else { + unsigned int n; + + if ( 
ktInstance->queueAbsorbedLen != 0 ) { + /* There is data in the queue node */ + unsigned char intermediate[capacityInBytes]; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ + n = right_encode(encbuf, ktInstance->blockNumber); + encbuf[n++] = 0xFF; + encbuf[n++] = 0xFF; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, encbuf, n) != 0) + return 1; + padding = 0x06; /* '01': chaining hop, final node */ + } + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->finalNode, padding) != 0) + return 1; + if ( ktInstance->fixedOutputLength != 0 ) { + ktInstance->phase = FINAL; + return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); + } + ktInstance->phase = SQUEEZING; + return 0; +} + +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char * output, size_t outputLen) +{ + if (ktInstance->phase != SQUEEZING) + return 1; + return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, outputLen); +} + +int KangarooTwelve( const unsigned char * input, size_t inLen, unsigned char * output, size_t outLen, const unsigned char * customization, size_t customLen ) +{ + KangarooTwelve_Instance ktInstance; + + if (outLen == 0) + return 1; + if (KangarooTwelve_Initialize(&ktInstance, outLen) != 0) + return 1; + if (KangarooTwelve_Update(&ktInstance, input, inLen) != 0) + return 1; + return KangarooTwelve_Final(&ktInstance, output, customization, customLen); +} diff --git a/src/crypto/defyx/KangarooTwelve.h b/src/crypto/defyx/KangarooTwelve.h 
new file mode 100644 index 00000000..0e9ed41e --- /dev/null +++ b/src/crypto/defyx/KangarooTwelve.h @@ -0,0 +1,89 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KangarooTwelve_h_ +#define _KangarooTwelve_h_ + +#ifndef KeccakP1600_excluded + +#include +#include "align.h" +#include "KeccakSpongeWidth1600.h" +#include "Phases.h" + +typedef KCP_Phases KangarooTwelve_Phases; + +typedef struct { + KeccakWidth1600_12rounds_SpongeInstance queueNode; + KeccakWidth1600_12rounds_SpongeInstance finalNode; + size_t fixedOutputLength; + size_t blockNumber; + unsigned int queueAbsorbedLen; + KangarooTwelve_Phases phase; +} KangarooTwelve_Instance; + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen ); + +/** + * Function to initialize a KangarooTwelve instance. + * @param ktInstance Pointer to the instance to be initialized. + * @param outputByteLen The desired number of output bytes, + * or 0 for an arbitrarily-long output. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); + +/** + * Function to give input data to be absorbed. + * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). + * @param input Pointer to the input message data (M). + * @param inputByteLen The number of bytes provided in the input message data. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); + +/** + * Function to call after all the input message has been input, and to get + * output bytes if the length was specified when calling KangarooTwelve_Initialize(). + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of + * output bytes is equal to @a outputByteLen. + * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes + * must be extracted using the KangarooTwelve_Squeeze() function. + * @param output Pointer to the buffer where to store the output data. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); + +/** + * Function to squeeze output data. + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * @param data Pointer to the buffer where to store the output data. + * @param outputByteLen The number of output bytes desired. + * @pre KangarooTwelve_Final() must have been already called. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); + +#endif + +#endif diff --git a/src/crypto/defyx/KeccakP-1600-SnP.h b/src/crypto/defyx/KeccakP-1600-SnP.h new file mode 100644 index 00000000..907e3581 --- /dev/null +++ b/src/crypto/defyx/KeccakP-1600-SnP.h @@ -0,0 +1,41 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to SnP-documentation.h for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +#define KeccakP1600_implementation "64-bit reference implementation" +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 + +#ifdef KeccakReference +void KeccakP1600_StaticInitialize( void ); +#else +#define KeccakP1600_StaticInitialize() +#endif +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); +void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_Permute_24rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char 
*input, unsigned char *output, unsigned int offset, unsigned int length); + +#endif diff --git a/src/crypto/defyx/KeccakP-1600-reference.c b/src/crypto/defyx/KeccakP-1600-reference.c new file mode 100644 index 00000000..0c126030 --- /dev/null +++ b/src/crypto/defyx/KeccakP-1600-reference.c @@ -0,0 +1,402 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +This file implements Keccak-p[1600] in a SnP-compatible way. +Please refer to SnP-documentation.h for more details. + +This implementation comes with KeccakP-1600-SnP.h in the same folder. +Please refer to LowLevel.build for the exact list of other files it must be combined with. 
+*/ + +#include +#include +#include +#include +#include "brg_endian.h" +#ifdef KeccakReference +#include "displayIntermediateValues.h" +#endif + +typedef unsigned char UINT8; +typedef unsigned long long UINT64; +typedef UINT64 tKeccakLane; + +#define maxNrRounds 24 +#define nrLanes 25 +#define index(x, y) (((x)%5)+5*((y)%5)) + +#ifdef KeccakReference + +static tKeccakLane KeccakRoundConstants[maxNrRounds]; +static unsigned int KeccakRhoOffsets[nrLanes]; + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_InitializeRoundConstants(void); +void KeccakP1600_InitializeRhoOffsets(void); +static int LFSR86540(UINT8 *LFSR); + +void KeccakP1600_StaticInitialize(void) +{ + if (sizeof(tKeccakLane) != 8) { + printf("tKeccakLane should be 64-bit wide\n"); + abort(); + } + KeccakP1600_InitializeRoundConstants(); + KeccakP1600_InitializeRhoOffsets(); +} + +void KeccakP1600_InitializeRoundConstants(void) +{ + UINT8 LFSRstate = 0x01; + unsigned int i, j, bitPosition; + + for(i=0; i> (64-offset))) : a) + +static void theta(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane C[5], D[5]; + + for(x=0; x<5; x++) { + C[x] = 0; + for(y=0; y<5; y++) + C[x] ^= A[index(x, y)]; + } + for(x=0; x<5; x++) + D[x] = ROL64(C[(x+1)%5], 1) ^ C[(x+4)%5]; + for(x=0; x<5; x++) + for(y=0; y<5; y++) + A[index(x, y)] ^= D[x]; +} + +static void rho(tKeccakLane *A) +{ + unsigned int x, y; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(x, y)] = ROL64(A[index(x, y)], KeccakRhoOffsets[index(x, y)]); +} + +static void pi(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane tempA[25]; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + tempA[index(x, y)] = A[index(x, y)]; + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(0*x+1*y, 2*x+3*y)] = tempA[index(x, y)]; +} + +static void chi(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane C[5]; + + for(y=0; y<5; y++) { + for(x=0; x<5; x++) + C[x] = A[index(x, y)] ^ ((~A[index(x+1, y)]) & A[index(x+2, y)]); + for(x=0; x<5; 
x++) + A[index(x, y)] = C[x]; + } +} + +static void iota(tKeccakLane *A, unsigned int indexRound) +{ + A[index(0, 0)] ^= KeccakRoundConstants[indexRound]; +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + assert(offset < 200); + assert(offset+length <= 200); + memcpy(data, (unsigned char*)state+offset, length); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length) +{ + unsigned int i; + + assert(offset < 200); + assert(offset+length <= 200); + for(i=0; i> 32)); + fprintf(f, "%08X", (unsigned int)(KeccakRoundConstants[i] & 0xFFFFFFFFULL)); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +void KeccakP1600_DisplayRhoOffsets(FILE *f) +{ + unsigned int x, y; + + for(y=0; y<5; y++) for(x=0; x<5; x++) { + fprintf(f, "RhoOffset[%i][%i] = ", x, y); + fprintf(f, "%2i", KeccakRhoOffsets[index(x, y)]); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} diff --git a/src/crypto/defyx/KeccakSponge-common.h b/src/crypto/defyx/KeccakSponge-common.h new file mode 100644 index 00000000..8fb3ba1a --- /dev/null +++ b/src/crypto/defyx/KeccakSponge-common.h @@ -0,0 +1,35 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakSpongeCommon_h_ +#define _KeccakSpongeCommon_h_ + +#include +#include "align.h" + +#define KCP_DeclareSpongeStructure(prefix, size, alignment) \ + ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \ + unsigned char state[size]; \ + unsigned int rate; \ + unsigned int byteIOIndex; \ + int squeezing; \ + } prefix##_SpongeInstance; + +#define KCP_DeclareSpongeFunctions(prefix) \ + int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \ + int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \ + int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \ + int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \ + int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen); + +#endif diff --git a/src/crypto/defyx/KeccakSponge.inc b/src/crypto/defyx/KeccakSponge.inc new file mode 100644 index 00000000..f6c59cee --- /dev/null +++ b/src/crypto/defyx/KeccakSponge.inc @@ -0,0 +1,311 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define JOIN0(a, b) a ## b +#define JOIN(a, b) JOIN0(a, b) + +#define Sponge JOIN(prefix, _Sponge) +#define SpongeInstance JOIN(prefix, _SpongeInstance) +#define SpongeInitialize JOIN(prefix, _SpongeInitialize) +#define SpongeAbsorb JOIN(prefix, _SpongeAbsorb) +#define SpongeAbsorbLastFewBits JOIN(prefix, _SpongeAbsorbLastFewBits) +#define SpongeSqueeze JOIN(prefix, _SpongeSqueeze) + +#define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes) +#define SnP_stateAlignment JOIN(SnP, _stateAlignment) +#define SnP_StaticInitialize JOIN(SnP, _StaticInitialize) +#define SnP_Initialize JOIN(SnP, _Initialize) +#define SnP_AddByte JOIN(SnP, _AddByte) +#define SnP_AddBytes JOIN(SnP, _AddBytes) +#define SnP_ExtractBytes JOIN(SnP, _ExtractBytes) + +int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen) +{ + ALIGN(SnP_stateAlignment) unsigned char state[SnP_stateSizeInBytes]; + unsigned int partialBlock; + const unsigned char *curInput = input; + unsigned char *curOutput = output; + unsigned int rateInBytes = rate/8; + + if (rate+capacity != SnP_width) + return 1; + if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0)) + return 1; + if (suffix == 0) + return 1; + + /* Initialize the state */ + SnP_StaticInitialize(); + SnP_Initialize(state); + + /* First, absorb whole blocks */ +#ifdef SnP_FastLoop_Absorb + if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) { + /* fast lane: whole lane rate */ + size_t j; + j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen); + curInput += j; + inputByteLen -= j; + } +#endif + while(inputByteLen >= (size_t)rateInBytes) { + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", curInput, rateInBytes); + #endif + SnP_AddBytes(state, curInput, 0, rateInBytes); + SnP_Permute(state); + curInput += rateInBytes; 
+ inputByteLen -= rateInBytes; + } + + /* Then, absorb what remains */ + partialBlock = (unsigned int)inputByteLen; + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock); + #endif + SnP_AddBytes(state, curInput, 0, partialBlock); + + /* Finally, absorb the suffix */ + #ifdef KeccakReference + { + unsigned char delimitedData1[1]; + delimitedData1[0] = suffix; + displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1); + } + #endif + /* Last few bits, whose delimiter coincides with first bit of padding */ + SnP_AddByte(state, suffix, partialBlock); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1))) + SnP_Permute(state); + /* Second bit of padding */ + SnP_AddByte(state, 0x80, rateInBytes-1); + #ifdef KeccakReference + { + unsigned char block[SnP_width/8]; + memset(block, 0, SnP_width/8); + block[rateInBytes-1] = 0x80; + displayBytes(1, "Second bit of padding", block, rateInBytes); + } + #endif + SnP_Permute(state); + #ifdef KeccakReference + displayText(1, "--- Switching to squeezing phase ---"); + #endif + + /* First, output whole blocks */ + while(outputByteLen > (size_t)rateInBytes) { + SnP_ExtractBytes(state, curOutput, 0, rateInBytes); + SnP_Permute(state); + #ifdef KeccakReference + displayBytes(1, "Squeezed block", curOutput, rateInBytes); + #endif + curOutput += rateInBytes; + outputByteLen -= rateInBytes; + } + + /* Finally, output what remains */ + partialBlock = (unsigned int)outputByteLen; + SnP_ExtractBytes(state, curOutput, 0, partialBlock); + #ifdef KeccakReference + displayBytes(1, "Squeezed block (part)", curOutput, partialBlock); + #endif + + return 0; +} + +/* ---------------------------------------------------------------- */ +/* ---------------------------------------------------------------- */ +/* 
---------------------------------------------------------------- */ + +int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity) +{ + if (rate+capacity != SnP_width) + return 1; + if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0)) + return 1; + SnP_StaticInitialize(); + SnP_Initialize(instance->state); + instance->rate = rate; + instance->byteIOIndex = 0; + instance->squeezing = 0; + + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + const unsigned char *curData; + unsigned int rateInBytes = instance->rate/8; + + if (instance->squeezing) + return 1; /* Too late for additional input */ + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == 0) && (dataByteLen >= (i + rateInBytes))) { +#ifdef SnP_FastLoop_Absorb + /* processing full blocks first */ + if ((rateInBytes % (SnP_width/200)) == 0) { + /* fast lane: whole lane rate */ + j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i); + i += j; + curData += j; + } + else { +#endif + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", curData, rateInBytes); + #endif + SnP_AddBytes(instance->state, curData, 0, rateInBytes); + SnP_Permute(instance->state); + curData+=rateInBytes; + } + i = dataByteLen - j; +#ifdef SnP_FastLoop_Absorb + } +#endif + } + else { + /* normal lane: using the message queue */ + partialBlock = (unsigned int)(dataByteLen - i); + if (partialBlock+instance->byteIOIndex > rateInBytes) + partialBlock = rateInBytes-instance->byteIOIndex; + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed (part)", curData, partialBlock); + #endif + i += partialBlock; + + SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + 
curData += partialBlock; + instance->byteIOIndex += partialBlock; + if (instance->byteIOIndex == rateInBytes) { + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + } + } + } + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData) +{ + unsigned int rateInBytes = instance->rate/8; + + if (delimitedData == 0) + return 1; + if (instance->squeezing) + return 1; /* Too late for additional input */ + + #ifdef KeccakReference + { + unsigned char delimitedData1[1]; + delimitedData1[0] = delimitedData; + displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1); + } + #endif + /* Last few bits, whose delimiter coincides with first bit of padding */ + SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) + SnP_Permute(instance->state); + /* Second bit of padding */ + SnP_AddByte(instance->state, 0x80, rateInBytes-1); + #ifdef KeccakReference + { + unsigned char block[SnP_width/8]; + memset(block, 0, SnP_width/8); + block[rateInBytes-1] = 0x80; + displayBytes(1, "Second bit of padding", block, rateInBytes); + } + #endif + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 1; + #ifdef KeccakReference + displayText(1, "--- Switching to squeezing phase ---"); + #endif + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + unsigned int rateInBytes = instance->rate/8; + unsigned char *curData; + + if (!instance->squeezing) + SpongeAbsorbLastFewBits(instance, 0x01); + + i = 0; + curData = data; + while(i < 
dataByteLen) { + if ((instance->byteIOIndex == rateInBytes) && (dataByteLen >= (i + rateInBytes))) { + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + SnP_Permute(instance->state); + SnP_ExtractBytes(instance->state, curData, 0, rateInBytes); + #ifdef KeccakReference + displayBytes(1, "Squeezed block", curData, rateInBytes); + #endif + curData+=rateInBytes; + } + i = dataByteLen - j; + } + else { + /* normal lane: using the message queue */ + if (instance->byteIOIndex == rateInBytes) { + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + } + partialBlock = (unsigned int)(dataByteLen - i); + if (partialBlock+instance->byteIOIndex > rateInBytes) + partialBlock = rateInBytes-instance->byteIOIndex; + i += partialBlock; + + SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + #ifdef KeccakReference + displayBytes(1, "Squeezed block (part)", curData, partialBlock); + #endif + curData += partialBlock; + instance->byteIOIndex += partialBlock; + } + } + return 0; +} + +/* ---------------------------------------------------------------- */ + +#undef Sponge +#undef SpongeInstance +#undef SpongeInitialize +#undef SpongeAbsorb +#undef SpongeAbsorbLastFewBits +#undef SpongeSqueeze +#undef SnP_stateSizeInBytes +#undef SnP_stateAlignment +#undef SnP_StaticInitialize +#undef SnP_Initialize +#undef SnP_AddByte +#undef SnP_AddBytes +#undef SnP_ExtractBytes diff --git a/src/crypto/defyx/KeccakSpongeWidth1600.c b/src/crypto/defyx/KeccakSpongeWidth1600.c new file mode 100644 index 00000000..672ec36d --- /dev/null +++ b/src/crypto/defyx/KeccakSpongeWidth1600.c @@ -0,0 +1,54 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include "KeccakSpongeWidth1600.h" + +#ifdef KeccakReference + #include "displayIntermediateValues.h" +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + + #define prefix KeccakWidth1600 + #define SnP KeccakP1600 + #define SnP_width 1600 + #define SnP_Permute KeccakP1600_Permute_24rounds + #if defined(KeccakF1600_FastLoop_supported) + #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb + #endif + #include "KeccakSponge.inc" + #undef prefix + #undef SnP + #undef SnP_width + #undef SnP_Permute + #undef SnP_FastLoop_Absorb +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + + #define prefix KeccakWidth1600_12rounds + #define SnP KeccakP1600 + #define SnP_width 1600 + #define SnP_Permute KeccakP1600_Permute_12rounds + #if defined(KeccakP1600_12rounds_FastLoop_supported) + #define SnP_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb + #endif + #include "KeccakSponge.inc" + #undef prefix + #undef SnP + #undef SnP_width + #undef SnP_Permute + #undef SnP_FastLoop_Absorb +#endif diff --git a/src/crypto/defyx/KeccakSpongeWidth1600.h b/src/crypto/defyx/KeccakSpongeWidth1600.h new file mode 100644 index 00000000..1558256c --- /dev/null +++ b/src/crypto/defyx/KeccakSpongeWidth1600.h @@ -0,0 +1,31 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakSpongeWidth1600_h_ +#define _KeccakSpongeWidth1600_h_ + +#include "KeccakSponge-common.h" + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) + KCP_DeclareSpongeFunctions(KeccakWidth1600) +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + KCP_DeclareSpongeStructure(KeccakWidth1600_12rounds, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) + KCP_DeclareSpongeFunctions(KeccakWidth1600_12rounds) +#endif + +#endif diff --git a/src/crypto/defyx/Phases.h b/src/crypto/defyx/Phases.h new file mode 100644 index 00000000..769125c3 --- /dev/null +++ b/src/crypto/defyx/Phases.h @@ -0,0 +1,22 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _Phases_h_ +#define _Phases_h_ + +typedef enum { + NOT_INITIALIZED, + ABSORBING, + FINAL, + SQUEEZING +} KCP_Phases; + +#endif diff --git a/src/crypto/defyx/align.h b/src/crypto/defyx/align.h new file mode 100644 index 00000000..90c1b37a --- /dev/null +++ b/src/crypto/defyx/align.h @@ -0,0 +1,32 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _align_h_ +#define _align_h_ + +/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif + +#endif diff --git a/src/crypto/defyx/brg_endian.h b/src/crypto/defyx/brg_endian.h new file mode 100644 index 00000000..7c640b90 --- /dev/null +++ b/src/crypto/defyx/brg_endian.h @@ -0,0 +1,143 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( 
__BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ + defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) 
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/src/crypto/defyx/defyx.cpp b/src/crypto/defyx/defyx.cpp new file mode 100644 index 00000000..0d1a9697 --- /dev/null +++ b/src/crypto/defyx/defyx.cpp @@ -0,0 +1,109 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "defyx.h" +#include "crypto/randomx/blake2/blake2.h" +#include "crypto/randomx/vm_interpreted.hpp" +#include "crypto/randomx/vm_interpreted_light.hpp" +#include "crypto/randomx/vm_compiled.hpp" +#include "crypto/randomx/vm_compiled_light.hpp" +#include "crypto/randomx/jit_compiler_x86_static.hpp" + +#include + +extern "C" { +#include "yescrypt.h" +#include "KangarooTwelve.h" +} + +#define YESCRYPT_FLAGS YESCRYPT_RW +#define YESCRYPT_BASE_N 2048 +#define YESCRYPT_R 8 +#define YESCRYPT_P 1 + +RandomX_ConfigurationScala::RandomX_ConfigurationScala() +{ + ArgonMemory = 131072; + ArgonIterations = 2; + ArgonSalt = "DefyXScala\x13"; + CacheAccesses = 2; + DatasetBaseSize = 33554432; + ProgramSize = 64; + ProgramIterations = 1024; + ProgramCount = 4; + ScratchpadL3_Size = 262144; + ScratchpadL2_Size = 131072; + ScratchpadL1_Size = 65536; +} + +RandomX_ConfigurationScala RandomX_ScalaConfig; + +int sipesh(void *out, size_t outlen, const void *in, size_t inlen, const void *salt, size_t saltlen, unsigned int t_cost, unsigned int m_cost) +{ + yescrypt_local_t local; + int retval; + + if (yescrypt_init_local(&local)) + return -1; + retval = yescrypt_kdf(NULL, &local, (const uint8_t*)in, inlen, (const uint8_t*)salt, saltlen, + (uint64_t)YESCRYPT_BASE_N << m_cost, YESCRYPT_R, YESCRYPT_P, + t_cost, 0, YESCRYPT_FLAGS, (uint8_t*)out, outlen); + if (yescrypt_free_local(&local)) + return -1; + return retval; +} + +int k12(const void *data, size_t length, 
void *hash) +{ + + int kDo = KangarooTwelve((const unsigned char *)data, length, (unsigned char *)hash, 32, 0, 0); + return kDo; +} + + +extern "C" { + + void defyx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output) { + assert(machine != nullptr); + assert(inputSize == 0 || input != nullptr); + assert(output != nullptr); + alignas(16) uint64_t tempHash[8]; + blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + sipesh(tempHash, sizeof(tempHash), input, inputSize, input, inputSize, 0, 0); + k12(input, inputSize, tempHash); + machine->initScratchpad(&tempHash); + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { + machine->run(&tempHash); + blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(&tempHash); + machine->getFinalResult(output, RANDOMX_HASH_SIZE); + } + +} diff --git a/src/crypto/defyx/defyx.h b/src/crypto/defyx/defyx.h new file mode 100644 index 00000000..a67e7fde --- /dev/null +++ b/src/crypto/defyx/defyx.h @@ -0,0 +1,57 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DEFYX_H +#define DEFYX_H + +#include "crypto/randomx/randomx.h" + +struct RandomX_ConfigurationScala : public RandomX_ConfigurationBase { RandomX_ConfigurationScala(); }; + +extern RandomX_ConfigurationScala RandomX_ScalaConfig; + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * Calculates a RandomX hash value. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void defyx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/crypto/defyx/sha256.c b/src/crypto/defyx/sha256.c new file mode 100644 index 00000000..49b937cd --- /dev/null +++ b/src/crypto/defyx/sha256.c @@ -0,0 +1,411 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "sysendian.h" + +#include "sha256.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
+ */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. 
Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX_Y * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update_Y(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update_Y(ctx, len, 8); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ +void +SHA256_Init_Y(SHA256_CTX_Y * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. 
*/ +void +HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init_Y(&ctx->ictx); + SHA256_Update_Y(&ctx->ictx, K, Klen); + SHA256_Final_Y(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init_Y(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update_Y(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init_Y(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update_Y(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +void +HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + + /* Feed data to the inner SHA256 operation. */ + SHA256_Update_Y(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +void +HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final_Y(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update_Y(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final_Y(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX_Y PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); + HMAC_SHA256_Update_Y(&hctx, ivec, 4); + HMAC_SHA256_Final_Y(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&hctx, U, 32); + HMAC_SHA256_Final_Y(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); +} diff --git a/src/crypto/defyx/sha256.h b/src/crypto/defyx/sha256.h new file mode 100644 index 00000000..f935cfaa --- /dev/null +++ b/src/crypto/defyx/sha256.h @@ -0,0 +1,62 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include <sys/types.h> + +#include <stdint.h> + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t count[2]; + unsigned char buf[64]; +} SHA256_CTX_Y; + +typedef struct HMAC_SHA256Context { + SHA256_CTX_Y ictx; + SHA256_CTX_Y octx; +} HMAC_SHA256_CTX_Y; + +void SHA256_Init_Y(SHA256_CTX_Y *); +void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); +void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); +void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + +#endif /* !_SHA256_H_ */ diff --git a/src/crypto/defyx/sysendian.h b/src/crypto/defyx/sysendian.h new file mode 100644 index 00000000..04951946 --- /dev/null +++ b/src/crypto/defyx/sysendian.h @@ -0,0 +1,138 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the <sys/endian.h> we have isn't usable. 
*/ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include <sys/endian.h> + +#else + +#include <stdint.h> + +static inline uint32_t +be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static inline void +be32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static inline uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static inline void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + +static inline uint32_t +le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void +le32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +static inline uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static inline void +le64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 
8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/src/crypto/defyx/yescrypt-best.c b/src/crypto/defyx/yescrypt-best.c new file mode 100644 index 00000000..4e836215 --- /dev/null +++ b/src/crypto/defyx/yescrypt-best.c @@ -0,0 +1,5 @@ +#ifdef __SSE2__ +#include "yescrypt-simd.c" +#else +#include "yescrypt-opt.c" +#endif diff --git a/src/crypto/defyx/yescrypt-opt.c b/src/crypto/defyx/yescrypt-opt.c new file mode 100644 index 00000000..3da0a532 --- /dev/null +++ b/src/crypto/defyx/yescrypt-opt.c @@ -0,0 +1,1102 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "sha256.h" +#include "sysendian.h" + +#include "yescrypt.h" + +#include "yescrypt-platform.c" + +static inline void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = *src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +} + +static inline void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +} + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static inline void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static inline void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define UNCOMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + UNCOMBINE(0, 0, 6) + UNCOMBINE(1, 5, 3) + UNCOMBINE(2, 2, 0) + UNCOMBINE(3, 7, 5) + UNCOMBINE(4, 4, 2) + UNCOMBINE(5, 1, 7) + UNCOMBINE(6, 6, 4) + 
UNCOMBINE(7, 3, 1) +#undef UNCOMBINE +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void +salsa20(uint64_t B[8], uint32_t doublerounds) +{ + salsa20_blk_t X; +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + do { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } while (--doublerounds); +#undef x + + { + uint32_t i; + salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. 
+ */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20(X, 4); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20(X, 4); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } +} + +/* These are tunable */ +#define PWXsimple 2 +#define PWXgather 4 +#define PWXrounds 6 +#define Swidth 8 + +/* Derived values. Not tunable on their own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint64_t)) +#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) +#define Swords (Sbytes / sizeof(uint64_t)) +#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) +#define rmin ((PWXbytes + 127) / 128) + +#if PWXbytes % 32 != 0 +#error "blkcpy() and blkxor() currently work on multiples of 32." +#endif + +typedef struct { + uint64_t *S0, *S1, *S2; + size_t w; +} pwxform_ctx_t; + +#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
+ */ +static void +pwxform(uint64_t * B, pwxform_ctx_t * ctx) +{ + uint64_t (*X)[PWXsimple] = (uint64_t (*)[PWXsimple])B; + uint64_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i, j; +#if PWXsimple > 2 + size_t k; +#endif + + /* 2: for j = 0 to PWXgather - 1 do */ + for (j = 0; j < PWXgather; j++) { + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; +#if PWXsimple > 1 + uint64_t x1 = Xj[1]; +#endif + + /* 1: for i = 0 to PWXrounds - 1 do */ + for (i = 0; i < PWXrounds; i++) { + uint64_t x = x0 & Smask2; + const uint64_t *p0, *p1; + + /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p0 = (const uint64_t *)((uint8_t *)S0 + (uint32_t)x); + /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p1 = (const uint64_t *)((uint8_t *)S1 + (x >> 32)); + + /* 5: for k = 0 to PWXsimple - 1 do */ + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + +#if PWXsimple > 1 + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; +#endif + +#if PWXsimple > 2 + /* 5: for k = 0 to PWXsimple - 1 do */ + for (k = 2; k < PWXsimple; k++) { + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x = Xj[k]; + + x = (uint64_t)(x >> 32) * (uint32_t)x; + x += p0[k]; + x ^= p1[k]; + + Xj[k] = x; + } +#endif + + /* 8: if (i != 0) and (i != PWXrounds - 1) */ + if ((i - 1) < PWXrounds - 2) { + /* 9: S2_w <-- B_j */ + /* 10: w <-- w + 1 */ + uint64_t *p2 = (uint64_t *)((uint8_t *)S2 + w); + w += PWXbytes; + p2[0] = x0; +#if PWXsimple > 1 + p2[1] = x1; +#endif +#if PWXsimple > 2 + for (k = 2; k < PWXsimple; k++) + p2[k] = Xj[k]; +#endif + } + } + + Xj[0] = x0; +#if PWXsimple > 1 + Xj[1] = x1; +#endif + + w -= (PWXrounds - 2) * PWXbytes - PWXsimple * 8; + } + + /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ + ctx->S0 = S2; + ctx->S1 = S0; + ctx->S2 = S1; + 
/* 15: w <-- w mod 2^Swidth */ + ctx->w = (w + (PWXrounds - 3) * PWXbytes) & Smask; +} + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/2, ctx, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, + pwxform_ctx_t * ctx, size_t r) +{ + size_t r1, r2, i; + + /* Convert 128-byte blocks to PWXbytes blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r1 = r * 128 / PWXbytes; + + /* 2: X <-- B'_{r_1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * PWXwords], PWXwords); + + /* 3: for i = 0 to r_1 - 1 do */ + /* 4: if r_1 > 1 */ + if (r1 > 1) { + /* 5: X <-- X \xor B'_i */ + blkxor(Bout, Bin, PWXwords); + } + + /* 7: X <-- pwxform(X) */ + /* 8: B'_i <-- X */ + pwxform(Bout, ctx); + + /* 3: for i = 0 to r_1 - 1 do */ + for (i = 1; i < r1; i++) { + /* 5: X <-- X \xor B'_i */ + blkcpy(&Bout[i * PWXwords], &Bout[(i - 1) * PWXwords], + PWXwords); + blkxor(&Bout[i * PWXwords], &Bin[i * PWXwords], PWXwords); + + /* 7: X <-- pwxform(X) */ + /* 8: B'_i <-- X */ + pwxform(&Bout[i * PWXwords], ctx); + } + +#if PWXbytes > 128 + /* + * Handle partial blocks. If we were using just one buffer, like in + * the algorithm specification, the data would already be there, but + * since we use separate input and output buffers, we may have to copy + * some data over (which will then be processed by the Salsa20/8 + * invocations below) in this special case - that is, when 128r is not + * a multiple of PWXbytes. Since PWXgather and PWXsimple must each be + * a power of 2 (per the specification), PWXbytes is also a power of 2. + * Thus, 128r is obviously a multiple of valid values of PWXbytes up to + * 128, inclusive. When PWXbytes is larger than that (thus, 256 or + * larger) we perform this extra check. 
+ */ + if (i * PWXwords < r * 16) + blkcpy(&Bout[i * PWXwords], &Bin[i * PWXwords], + r * 16 - i * PWXwords); +#endif + + /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ + i = (r1 - 1) * PWXbytes / 64; + + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* 11: B_i <-- H(B_i) */ + salsa20(&Bout[i * 8], 1); + + for (i++; i < r2; i++) { + /* 13: B_i <-- H(B_i \xor B_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20(&Bout[i * 8], 1); + } +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be even and + * no smaller than 2. 
+ */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + if (ctx) + blockmix_pwxform(X, Y, ctx, r); + else + blockmix_salsa8(X, Y, Z, r); + blkcpy(&V[s], Y, s); + + X = XY; + + if (VROM) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix_pwxform(X, Y, ctx, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + } + } else if (flags & YESCRYPT_RW) { + /* 4: X <-- H(X) */ + blockmix_pwxform(Y, X, ctx, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) 
*/ + blockmix_pwxform(X, Y, ctx, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix_pwxform(Y, X, ctx, r); + } + } else { + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + /* 4: X <-- H(X) */ + blockmix_salsa8(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The value Nloop must be even. 
+ */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 16 * r; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t i, j; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + if (VROM) { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix_pwxform(X, Y, ctx, r); + + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + } + } else if (ctx) { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix_pwxform(X, Y, ctx, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix_pwxform(Y, X, ctx, r); + } while (--i); + } else { + uint64_t * Z = &XY[2 * s]; + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, 
&V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + blockmix_salsa8(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + blockmix_salsa8(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. 
+ */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, uint8_t * S, uint8_t * passwd) +{ + size_t s = 16 * r; + uint64_t Nchunk, Nloop_all, Nloop_rw; + uint32_t i; + + /* 1: n <-- N / p */ + Nchunk = N / p; + + /* 2: Nloop_all <-- fNloop(n, t, flags) */ + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + /* 6: Nloop_rw <-- 0 */ + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) { + Nloop_rw = Nloop_all; + } else { + /* 3: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + /* 4: Nloop_rw <-- Nloop_all / p */ + Nloop_rw = Nloop_all / p; + } + } + + /* 8: n <-- n - (n mod 2) */ + Nchunk &= ~(uint64_t)1; /* round down to even */ + /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ + Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ + + /* 11: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + /* 12: u <-- in */ + uint64_t Vchunk = i * Nchunk; + /* 13: if i = p - 1 */ + /* 14: n <-- N - u */ + /* 15: end if */ + /* 16: v <-- u + n - 1 */ + uint64_t Np = (i < p - 1) ? 
Nchunk : (N - Vchunk); + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + uint64_t * XYp = &XY[i * (2 * s + 8)]; +#else + uint64_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + /* 17: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + uint64_t *Si = (uint64_t *)(S + i * Salloc); + /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ + smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, + Si, 0, NULL, XYp, NULL); + ctx_i = (pwxform_ctx_t *)(Si + Swords); + /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ + ctx_i->S2 = Si; + /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ + ctx_i->S1 = Si + Swords / 3; + /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ + ctx_i->S0 = Si + Swords / 3 * 2; + /* 22: w_i <-- 0 */ + ctx_i->w = 0; + /* 23: if i = 0 */ + if (i == 0) { + /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, Bp + (s - 8), 64); + HMAC_SHA256_Update_Y(&ctx, passwd, 32); + HMAC_SHA256_Final_Y(passwd, &ctx); + } + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ + smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); + } + /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, VROM, XYp, ctx_i); + } + + /* 30: for i = 0 to p - 1 do */ + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; +#ifdef _OPENMP + uint64_t * XYp = &XY[i * (2 * s + 8)]; +#else + uint64_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + if (flags & YESCRYPT_RW) + ctx_i = (pwxform_ctx_t *)(S + i * Salloc + Sbytes); + /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, 
buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +static int +yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + const uint64_t * VROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY; + uint8_t * S; + uint64_t sha256[4]; + uint8_t dk[sizeof(sha256)], * dkp = buf; + + /* Sanity-check parameters */ + if ((flags & ~YESCRYPT_KNOWN_FLAGS) || (!flags && t)) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + if (flags & YESCRYPT_RW) { + if ((flags & YESCRYPT_WORM) || (N / p <= 1) || (r < rmin)) { + errno = EINVAL; + return 
-1; + } + if (p > SIZE_MAX / Salloc) { + errno = ENOMEM; + return -1; + } + } +#ifdef _OPENMP + else if (N > SIZE_MAX / 128 / (r * p)) { + errno = ENOMEM; + return -1; + } +#endif + + NROM = 0; + VROM = NULL; + if (shared) { + NROM = shared->aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + VROM = shared->aligned; + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_RW)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_RW) { + size_t S_size = (size_t)Salloc * p; + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_RW) + S = (uint8_t *)XY + XY_size; + + if (flags) { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + (flags & __YESCRYPT_PREHASH) ? 
16 : 8); + HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, + (uint8_t *)B, B_size); + + if (flags) + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + + if (p == 1 || (flags & YESCRYPT_RW)) { + smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, + (uint8_t *)sha256); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, + &V[(size_t)16 * r * i * N], + NROM, VROM, + &XY[((size_t)32 * r + 8) * i], NULL, NULL); +#else + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, + NROM, VROM, XY, NULL, NULL); +#endif + } + } + + dkp = buf; + if (flags && buflen < sizeof(dk)) { + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, + dk, sizeof(dk)); + dkp = dk; + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if (flags && !(flags & __YESCRYPT_PREHASH)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + size_t clen = buflen; + if (clen > sizeof(dk)) + clen = sizeof(dk); + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(dk, &ctx); + memcpy(buf, dk, clen); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt or its revision as requested by the parameters. The inputs + * to this function are the same as those for yescrypt_kdf_body() above, with + * the addition of g, which controls hash upgrades (0 for no upgrades so far). + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, + yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + uint8_t dk[32]; + + if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && + p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, + dk, sizeof(dk)); + if (retval) + return retval; + passwd = dk; + passwdlen = sizeof(dk); + } + + do { + uint8_t * dkp = g ? dk : buf; + size_t dklen = g ? 
sizeof(dk) : buflen; + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N, r, p, t, flags, dkp, dklen); + if (retval) + return retval; + + passwd = dkp; + passwdlen = dklen; + + N <<= 2; + if (!N) + return -1; + t >>= 1; + } while (g--); + + return 0; +} diff --git a/src/crypto/defyx/yescrypt-platform.c b/src/crypto/defyx/yescrypt-platform.c new file mode 100644 index 00000000..8f6c22ba --- /dev/null +++ b/src/crypto/defyx/yescrypt-platform.c @@ -0,0 +1,188 @@ +/*- + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include + +#include "yescrypt.h" + +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, * aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; +/* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. + */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? 
size : 0; + return aligned; +} + +static inline void +init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, + const uint8_t * param, size_t paramlen, + uint64_t N, uint32_t r, uint32_t p, + yescrypt_init_shared_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_shared_t half1, half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared->aligned || !shared->aligned_size) + return -1; + } else { + init_region(shared); + } + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + if (yescrypt_kdf(NULL, shared, + param, paramlen, NULL, 0, N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.aligned_size /= 2; + half2.aligned += half1.aligned_size; + half2.aligned_size = half1.aligned_size; + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(shared); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + 
init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} diff --git a/src/crypto/defyx/yescrypt-simd.c b/src/crypto/defyx/yescrypt-simd.c new file mode 100644 index 00000000..884b2076 --- /dev/null +++ b/src/crypto/defyx/yescrypt-simd.c @@ -0,0 +1,1367 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +/* + * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding + * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. 
AVX + * and XOP are of further help either way. + */ + +#include +#ifdef __XOP__ +#include +#endif + +#include +#include +#include +#include + +#include "sha256.h" +#include "sysendian.h" + +#include "yescrypt.h" + +#include "yescrypt-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#ifdef __GNUC__ +#define unlikely(exp) __builtin_expect(exp, 0) +#else +#define unlikely(exp) (exp) +#endif + +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) \ + { \ + __m128i T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ + } +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ +\ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the Salsa20/2 core to the block provided in (X0 ... X3). + */ +#define SALSA20_2(out) \ + { \ + __m128i Y0 = X0; \ + __m128i Y1 = X1; \ + __m128i Y2 = X2; \ + __m128i Y3 = X3; \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } + +/** + * Apply the Salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). 
+ */ +#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ + X0 = _mm_xor_si128(X0, Z0); \ + X1 = _mm_xor_si128(X1, Z1); \ + X2 = _mm_xor_si128(X2, Z2); \ + X3 = _mm_xor_si128(X3, Z3); \ + { \ + maybe_decl Y0 = X0; \ + maybe_decl Y1 = X1; \ + maybe_decl Y2 = X2; \ + maybe_decl Y3 = X3; \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } + +#define SALSA20_8_XOR_MEM(in, out) \ + SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) + +#define SALSA20_8_XOR_REG(out) \ + SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) + +typedef union { + uint32_t w[16]; + __m128i q[4]; +} salsa20_blk_t; + +/** + * blockmix_salsa8(Bin, Bout, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. + */ +static void +blockmix_salsa8(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r) +{ + size_t i; + __m128i X0, X1, X2, X3; + + r--; + PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i * 2], _MM_HINT_T0) + PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) + } + PREFETCH(&Bin[r * 2], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + X0 = Bin[r * 2 + 1].q[0]; + X1 = Bin[r * 2 + 1].q[1]; + X2 = Bin[r * 2 + 1].q[2]; + X3 = Bin[r * 2 + 1].q[3]; + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i <= r; i++) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) + } +} + +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). + */ +#ifdef __AVX__ +#define HI32(X) \ + _mm_srli_si128((X), 4) +#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) +/* Intel's name, also supported by recent gcc */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64x(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(__SSE4_1__) +/* 32-bit */ +#include +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +/* This is tunable */ +#define Swidth 8 + +/* Not tunable in this implementation, 
hard-coded in a few places */ +#define PWXsimple 2 +#define PWXgather 4 + +/* Derived values. Not tunable except via Swidth above. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) +#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) + +#if !defined(__x86_64__) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_X_T __m128i +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ + s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#else +/* 64-bit, or 32-bit without SSE4.1 */ +#define PWXFORM_X_T uint64_t +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = EXTRACT64(X) & Smask2; \ + s0 = *(__m128i *)(S0 + (uint32_t)x); \ + s1 = *(__m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#endif + +#define PWXFORM_WRITE \ + *(__m128i *)(S2 + w) = X0; \ + *(__m128i *)(S2 + w + 16) = X1; \ + *(__m128i *)(S2 + w + 32) = X2; \ + *(__m128i *)(S2 + w + 48) = X3; \ + w += 64; + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0, x0, s00, s01) \ + PWXFORM_SIMD(X1, x1, s10, s11) \ + PWXFORM_SIMD(X2, x2, s20, s21) \ + PWXFORM_SIMD(X3, x3, s30, s31) + +#define PWXFORM \ + { \ + PWXFORM_X_T x0, x1, x2, x3; \ + __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ + PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND \ + w &= Smask; \ + { \ + uint8_t * Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } \ + } + +#define XOR4(in) \ + X0 = _mm_xor_si128(X0, (in)[0]); \ + X1 = _mm_xor_si128(X1, (in)[1]); \ + X2 = _mm_xor_si128(X2, (in)[2]); \ + X3 = _mm_xor_si128(X3, (in)[3]); + +#define OUT(out) \ + (out)[0] = X0; \ + 
(out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + +typedef struct { + uint8_t *S0, *S1, *S2; + size_t w; +} pwxform_ctx_t; + +#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void +blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + __m128i X0, X1, X2, X3; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + PREFETCH(&Bin[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i], _MM_HINT_T0) + } + + /* 2: X <-- B'_{r_1 - 1} */ + X0 = Bin[r].q[0]; + X1 = Bin[r].q[1]; + X2 = Bin[r].q[2]; + X3 = Bin[r].q[3]; + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + do { + /* 5: X <-- X \xor B'_i */ + XOR4(Bin[i].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + i++; + } while (1); + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) +} + +#define XOR4_2(in1, in2) \ + X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ + X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ + X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ + X3 = _mm_xor_si128((in1)[3], (in2)[3]); + +static uint32_t +blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r) +{ + size_t i; + __m128i X0, X1, X2, X3; + + r--; + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + } + 
PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i <= r; i++) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q) + SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) + } + + return _mm_cvtsi128_si32(X0); +} + +static uint32_t +blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + __m128i X0, X1, X2, X3; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r], _MM_HINT_NTA) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_NTA) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + } else { + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + } + + /* 2: X <-- B'_{r_1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + r--; + do { + /* 5: X <-- X \xor B'_i */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + /* 5: X <-- X \xor B'_i */ + XOR4(Bin1[i + 1].q) + XOR4(Bin2[i + 1].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i + 1].q) + + i += 2; + } while (1); + i++; + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: 
B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef XOR4 +#define XOR4(in, out) \ + (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ + (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ + (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ + (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); + +#define XOR4_Y \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +static uint32_t +blockmix_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + + /* 2: X <-- B'_{r_1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + r--; + do { + XOR4(Bin1[i].q, Bin2[i].q) + /* 5: X <-- X \xor B'_i */ + XOR4_Y + /* 7: X <-- pwxform(X) */ + PWXFORM + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + XOR4(Bin1[i + 1].q, Bin2[i + 1].q) + /* 5: X <-- X \xor B'_i */ + XOR4_Y + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i + 1].q) + + i += 2; + } while (1); + i++; + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef ARX +#undef SALSA20_2ROUNDS +#undef SALSA20_2 +#undef SALSA20_8_XOR_ANY +#undef SALSA20_8_XOR_MEM +#undef SALSA20_8_XOR_REG +#undef PWXFORM_X_T +#undef PWXFORM_SIMD +#undef PWXFORM_ROUND +#undef PWXFORM +#undef OUT +#undef XOR4 +#undef XOR4_2 +#undef XOR4_Y + +/** + * integerify(B, 
r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t +integerify(const salsa20_blk_t * B, size_t r) +{ + return B[2 * r - 1].w[0]; +} + +/** + * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r bytes in length. The value N must be even and no + * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and + * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 + * bytes as well saves cache lines, but might result in cache bank conflicts). + */ +static void +smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, + salsa20_blk_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 2 * r; + salsa20_blk_t * X = V, * Y; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + if (VROM) { + uint32_t n; + salsa20_blk_t * V_n; + const salsa20_blk_t * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, ctx); + + X = &V[2 * s]; + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + j = blockmix_xor(Y, V_j, X, r, 1, ctx); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V_n[i * s]; + + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 1, ctx); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 1, ctx); + } else if (flags & YESCRYPT_RW) { + uint32_t n; + salsa20_blk_t * V_n, * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, ctx); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[2 * s]; + blockmix(Y, X, r, ctx); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + Y = &V_n[i * s]; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 0, ctx); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, ctx); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 0, ctx); + } else { + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < N - 1; i += 2) { + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix_salsa8(X, Y, r); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[(i + 1) * s]; + blockmix_salsa8(Y, X, r); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix_salsa8(X, Y, r); + + /* 4: X <-- H(X) */ + X = XY; + blockmix_salsa8(Y, X, r); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. The value N must be a power of 2 + * greater than 1. The value Nloop must be even. 
The array V must be aligned + * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 + * bytes (aligning them to 64 bytes as well saves cache lines, but might result + * in cache bank conflicts). + */ +static void +smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, + yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, + const salsa20_blk_t * VROM, salsa20_blk_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 2 * r; + salsa20_blk_t * X = XY, * Y = &XY[s]; + uint64_t i; + uint32_t j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + i = Nloop / 2; + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + +/* + * Normally, VROM implies YESCRYPT_RW, but we check for these separately + * because our SMix resets YESCRYPT_RW for the smix2() calls operating on the + * entire V when p > 1. + */ + if (VROM && (flags & YESCRYPT_RW)) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + salsa20_blk_t * V_j = &V[j * s]; + const salsa20_blk_t * VROM_j; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(X, V_j, Y, r, ctx) & (NROM - 1); + VROM_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, VROM_j, X, r, 1, ctx) & (N - 1); + V_j = &V[j * s]; + } + } else if (VROM) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 1, ctx) & (N - 1); + V_j = &V[j * s]; + } + } else if (flags & YESCRYPT_RW) { 
+ /* 6: for i = 0 to N - 1 do */ + do { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(X, V_j, Y, r, ctx) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(Y, V_j, X, r, ctx) & (N - 1); + } while (--i); + } else if (ctx) { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 0, ctx) & (N - 1); + } while (--i); + } else { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1); + } while (--i); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage XY + * must be 256r or 256rp bytes in length (the larger size is required with + * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. 
+ * The array V must be aligned to a multiple of 64 bytes, and arrays B and + * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well + * saves cache lines and helps avoid false sharing in OpenMP-enabled builds + * when p > 1, but it might also result in cache bank conflicts). + */ +static void +smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, + salsa20_blk_t * XY, uint8_t * S, uint8_t * passwd) +{ + size_t s = 2 * r; + uint32_t Nchunk; + uint64_t Nloop_all, Nloop_rw; + uint32_t i; + + /* 1: n <-- N / p */ + Nchunk = N / p; + + /* 2: Nloop_all <-- fNloop(n, t, flags) */ + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + /* 6: Nloop_rw <-- 0 */ + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) { + Nloop_rw = Nloop_all; + } else { + /* 3: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + /* 4: Nloop_rw <-- Nloop_all / p */ + Nloop_rw = Nloop_all / p; + } + } + + /* 8: n <-- n - (n mod 2) */ + Nchunk &= ~(uint32_t)1; /* round down to even */ + /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ + Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ + + /* 11: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + /* 12: u <-- in */ + uint32_t Vchunk = i * Nchunk; + /* 13: if i = p - 1 */ + /* 14: n <-- N - u */ + /* 15: end if */ + /* 16: v <-- u + n - 1 */ + 
uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint8_t * Bp = &B[128 * r * i]; + salsa20_blk_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + /* 17: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + uint8_t *Si = S + i * Salloc; + /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ + smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, + (salsa20_blk_t *)Si, 0, NULL, XYp, NULL); + ctx_i = (pwxform_ctx_t *)(Si + Sbytes); + /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ + ctx_i->S2 = Si; + /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ + ctx_i->S1 = Si + Sbytes / 3; + /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ + ctx_i->S0 = Si + Sbytes / 3 * 2; + /* 22: w_i <-- 0 */ + ctx_i->w = 0; + /* 23: if i = 0 */ + if (i == 0) { + /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, Bp + (128 * r - 64), 64); + HMAC_SHA256_Update_Y(&ctx, passwd, 32); + HMAC_SHA256_Final_Y(passwd, &ctx); + } + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ + smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); + } + /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, VROM, XYp, ctx_i); + } + + /* 30: for i = 0 to p - 1 do */ + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint8_t * Bp = &B[128 * r * i]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + if (flags & YESCRYPT_RW) { + uint8_t *Si = S + i * Salloc; + ctx_i = (pwxform_ctx_t *)(Si + Sbytes); + } + /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * 
yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. (This optimized implementation currently additionally + * limits N to the range from 8 to 2^31, but other implementation might not.) + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +static int +yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + const salsa20_blk_t * VROM; + size_t B_size, V_size, XY_size, need; + uint8_t * B, * S; + salsa20_blk_t * V, * XY; + uint8_t sha256[32]; + uint8_t dk[sizeof(sha256)], * dkp = buf; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (N > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 3) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((r > SIZE_MAX / 256 / p) || + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + 
if (flags & YESCRYPT_RW) { + if (N / p <= 3) { + errno = EINVAL; + return -1; + } + if (p > SIZE_MAX / Salloc) { + errno = ENOMEM; + return -1; + } + } +#ifdef _OPENMP + else if (N > SIZE_MAX / 128 / (r * p)) { + errno = ENOMEM; + return -1; + } +#endif + + NROM = 0; + VROM = NULL; + if (shared) { + NROM = shared->aligned_size / ((size_t)128 * r); + if (NROM > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + VROM = shared->aligned; + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_RW)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (salsa20_blk_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_RW) { + size_t S_size = (size_t)Salloc * p; + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint8_t *)tmp.aligned; + XY = (salsa20_blk_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_RW) + S = (uint8_t *)XY + XY_size; + + if (flags) { + HMAC_SHA256_CTX_Y ctx; + 
HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + (flags & __YESCRYPT_PREHASH) ? 16 : 8); + HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); + HMAC_SHA256_Final_Y(sha256, &ctx); + passwd = sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); + + if (t || flags) + memcpy(sha256, B, sizeof(sha256)); + + if (p == 1 || (flags & YESCRYPT_RW)) { + smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, sha256); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, + &V[(size_t)2 * r * i * N], + NROM, VROM, + &XY[(size_t)4 * r * i], NULL, NULL); +#else + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, + NROM, VROM, XY, NULL, NULL); +#endif + } + } + + dkp = buf; + if (flags && buflen < sizeof(dk)) { + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, dk, sizeof(dk)); + dkp = dk; + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if (flags && !(flags & __YESCRYPT_PREHASH)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); + HMAC_SHA256_Final_Y(sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + size_t clen = buflen; + if (clen > sizeof(dk)) + clen = sizeof(dk); + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, sha256, sizeof(sha256)); + SHA256_Final_Y(dk, &ctx); + memcpy(buf, dk, clen); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt or its revision as requested by the parameters. The inputs + * to this function are the same as those for yescrypt_kdf_body() above, with + * the addition of g, which controls hash upgrades (0 for no upgrades so far). + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, + yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + uint8_t dk[32]; + + if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && + p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, + dk, sizeof(dk)); + if (retval) + return retval; + passwd = dk; + passwdlen = sizeof(dk); + } + + do { + uint8_t * dkp = g ? dk : buf; + size_t dklen = g ? 
sizeof(dk) : buflen; + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N, r, p, t, flags, dkp, dklen); + if (retval) + return retval; + + passwd = dkp; + passwdlen = dklen; + + N <<= 2; + if (!N) + return -1; + t >>= 1; + } while (g--); + + return 0; +} diff --git a/src/crypto/defyx/yescrypt.h b/src/crypto/defyx/yescrypt.h new file mode 100644 index 00000000..4af307e8 --- /dev/null +++ b/src/crypto/defyx/yescrypt.h @@ -0,0 +1,326 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. 
+ */ +typedef enum { +/* public */ + YESCRYPT_WORM = 2, + YESCRYPT_RW = 1, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000, + __YESCRYPT_PREHASH = 0x100000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_WORM | YESCRYPT_RW | \ + __YESCRYPT_INIT_SHARED | __YESCRYPT_PREHASH) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). + * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with shared->aligned + * being the start address of the ROM and shared->aligned_size being its size + * (which must be consistent with N, r, and p). This may be used e.g. when the + * ROM is to be placed in a SysV shared memory segment allocated by the caller. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. 
+ */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. g controls hash upgrades (0 for no + * upgrades so far). shared and flags may request special modes as described + * below. local is the thread-local data structure, allowing to preserve and + * reuse a memory allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. 
+ * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0, flags = 0, and shared = NULL. + * In this mode, the thread-local memory region (RAM) is first sequentially + * written to and then randomly read from. This algorithm is friendly towards + * time-memory tradeoffs (TMTO), available both to defenders (albeit not in + * this implementation) and to attackers. + * + * Setting YESCRYPT_WORM enables only minimal enhancements relative to classic + * scrypt: support for the t parameter, and pre- and post-hashing. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. 
Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_RW also moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. 
+ * + * As a side effect of differences between the algorithms, setting YESCRYPT_RW + * also changes the way the total processing time (combined for all threads) + * and memory allocation (if the parallelism is being made use of) is to be + * controlled from N*r*p (for classic scrypt) to N*r (in this modification). + * Obviously, these only differ for p > 1. + * + * Passing a shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->aligned and + * shared->aligned_size fields may be set by the caller of yescrypt_kdf() + * manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, uint32_t __g, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If shared is not NULL, a ROM + * is used and YESCRYPT_RW is required. Otherwise, whether to compute classic + * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or + * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined + * by the setting string. shared (if not NULL) and local must be initialized + * as described above for yescrypt_kdf(). buf must be large enough (as + * indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. 
+ * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to compute classic + * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or + * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined + * by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). 
This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +#endif /* !_YESCRYPT_H_ */ diff --git a/src/crypto/rx/RxAlgo.cpp b/src/crypto/rx/RxAlgo.cpp index b0e92e6e..3b7766ae 100644 --- a/src/crypto/rx/RxAlgo.cpp +++ b/src/crypto/rx/RxAlgo.cpp @@ -26,6 +26,7 @@ #include "crypto/randomx/randomx.h" +#include "crypto/defyx/defyx.h" #include "crypto/rx/RxAlgo.h" @@ -40,6 +41,10 @@ xmrig::Algorithm::Id xmrig::RxAlgo::apply(Algorithm::Id algorithm) randomx_apply_config(RandomX_LokiConfig); break; + case Algorithm::DEFYX: + randomx_apply_config(RandomX_ScalaConfig); + break; + default: randomx_apply_config(RandomX_MoneroConfig); break; @@ -61,6 +66,9 @@ size_t xmrig::RxAlgo::l3(Algorithm::Id algorithm) case Algorithm::RX_LOKI: return RandomX_LokiConfig.ScratchpadL3_Size; + case Algorithm::DEFYX: + return RandomX_ScalaConfig.ScratchpadL3_Size; + default: break; }