From c19cb4672d880d39ee2c60e7de18fbc01558072f Mon Sep 17 00:00:00 2001 From: MoneroOcean Date: Thu, 15 Aug 2019 17:23:22 -0700 Subject: [PATCH] Added experimental DefyX support --- CMakeLists.txt | 23 + src/backend/cpu/CpuWorker.cpp | 7 +- src/core/Benchmark.cpp | 1 + src/core/Benchmark.h | 2 + src/core/config/usage.h | 2 +- src/crypto/cn/CnAlgo.h | 3 + src/crypto/common/Algorithm.cpp | 8 + src/crypto/common/Algorithm.h | 1 + src/crypto/defyx/KangarooTwelve.c | 271 ++++ src/crypto/defyx/KangarooTwelve.h | 89 ++ src/crypto/defyx/KeccakP-1600-SnP.h | 41 + src/crypto/defyx/KeccakP-1600-reference.c | 402 ++++++ src/crypto/defyx/KeccakSponge-common.h | 35 + src/crypto/defyx/KeccakSponge.inc | 311 +++++ src/crypto/defyx/KeccakSpongeWidth1600.c | 54 + src/crypto/defyx/KeccakSpongeWidth1600.h | 31 + src/crypto/defyx/Phases.h | 22 + src/crypto/defyx/align.h | 32 + src/crypto/defyx/brg_endian.h | 143 +++ src/crypto/defyx/defyx.cpp | 109 ++ src/crypto/defyx/defyx.h | 57 + src/crypto/defyx/sha256.c | 411 +++++++ src/crypto/defyx/sha256.h | 62 + src/crypto/defyx/sysendian.h | 138 +++ src/crypto/defyx/yescrypt-best.c | 5 + src/crypto/defyx/yescrypt-opt.c | 1102 +++++++++++++++++ src/crypto/defyx/yescrypt-platform.c | 188 +++ src/crypto/defyx/yescrypt-simd.c | 1367 +++++++++++++++++++++ src/crypto/defyx/yescrypt.h | 326 +++++ src/crypto/rx/RxAlgo.cpp | 8 + 30 files changed, 5249 insertions(+), 2 deletions(-) create mode 100644 src/crypto/defyx/KangarooTwelve.c create mode 100644 src/crypto/defyx/KangarooTwelve.h create mode 100644 src/crypto/defyx/KeccakP-1600-SnP.h create mode 100644 src/crypto/defyx/KeccakP-1600-reference.c create mode 100644 src/crypto/defyx/KeccakSponge-common.h create mode 100644 src/crypto/defyx/KeccakSponge.inc create mode 100644 src/crypto/defyx/KeccakSpongeWidth1600.c create mode 100644 src/crypto/defyx/KeccakSpongeWidth1600.h create mode 100644 src/crypto/defyx/Phases.h create mode 100644 src/crypto/defyx/align.h create mode 100644 
src/crypto/defyx/brg_endian.h create mode 100644 src/crypto/defyx/defyx.cpp create mode 100644 src/crypto/defyx/defyx.h create mode 100644 src/crypto/defyx/sha256.c create mode 100644 src/crypto/defyx/sha256.h create mode 100644 src/crypto/defyx/sysendian.h create mode 100644 src/crypto/defyx/yescrypt-best.c create mode 100644 src/crypto/defyx/yescrypt-opt.c create mode 100644 src/crypto/defyx/yescrypt-platform.c create mode 100644 src/crypto/defyx/yescrypt-simd.c create mode 100644 src/crypto/defyx/yescrypt.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 09a28602..b312bfc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,6 +154,7 @@ find_package(UV REQUIRED) if (WITH_RANDOMX) include_directories(src/crypto/randomx) + include_directories(src/crypto/defyx) add_definitions(/DXMRIG_ALGO_RANDOMX) set(SOURCES_CRYPTO "${SOURCES_CRYPTO}" @@ -188,6 +189,28 @@ if (WITH_RANDOMX) src/crypto/rx/RxDataset.h src/crypto/rx/RxVm.cpp src/crypto/rx/RxVm.h + + src/crypto/defyx/align.h + src/crypto/defyx/brg_endian.h + src/crypto/defyx/defyx.cpp + src/crypto/defyx/defyx.h + src/crypto/defyx/KangarooTwelve.c + src/crypto/defyx/KangarooTwelve.h + src/crypto/defyx/KeccakP-1600-reference.c + src/crypto/defyx/KeccakP-1600-SnP.h + src/crypto/defyx/KeccakSponge-common.h + src/crypto/defyx/KeccakSponge.inc + src/crypto/defyx/KeccakSpongeWidth1600.c + src/crypto/defyx/KeccakSpongeWidth1600.h + src/crypto/defyx/Phases.h + src/crypto/defyx/sha256.c + src/crypto/defyx/sha256.h + src/crypto/defyx/sysendian.h + src/crypto/defyx/yescrypt-best.c + src/crypto/defyx/yescrypt-opt.c + src/crypto/defyx/yescrypt-platform.c + src/crypto/defyx/yescrypt-simd.c + src/crypto/defyx/yescrypt.h ) if (NOT ARCH_ID) set(ARCH_ID ${CMAKE_HOST_SYSTEM_PROCESSOR}) diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 14ef1797..6035025e 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -41,6 +41,7 @@ #ifdef XMRIG_ALGO_RANDOMX # include 
"crypto/randomx/randomx.h" +# include "crypto/defyx/defyx.h" #endif @@ -190,7 +191,11 @@ void xmrig::CpuWorker::start() # ifdef XMRIG_ALGO_RANDOMX if (job.algorithm().family() == Algorithm::RANDOM_X) { - randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + if (job.algorithm() == Algorithm::DEFYX) { + defyx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + } else { + randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + } } else # endif diff --git a/src/core/Benchmark.cpp b/src/core/Benchmark.cpp index 3840bddc..6d754f84 100644 --- a/src/core/Benchmark.cpp +++ b/src/core/Benchmark.cpp @@ -126,6 +126,7 @@ float Benchmark::get_algo_perf(Algorithm::Id algo) const { case Algorithm::RX_0: return m_bench_algo_perf[BenchAlgo::RX_0]; case Algorithm::RX_WOW: return m_bench_algo_perf[BenchAlgo::RX_WOW]; case Algorithm::RX_LOKI: return m_bench_algo_perf[BenchAlgo::RX_0]; + case Algorithm::DEFYX: return m_bench_algo_perf[BenchAlgo::DEFYX]; default: return 0.0f; } } diff --git a/src/core/Benchmark.h b/src/core/Benchmark.h index 66949878..443904e8 100644 --- a/src/core/Benchmark.h +++ b/src/core/Benchmark.h @@ -32,6 +32,7 @@ class Benchmark : public IJobResultListener { enum BenchAlgo : int { RX_0, // "rx/0" RandomX (reference configuration). RX_WOW, // "rx/wow" RandomWOW (Wownero). + DEFYX, // "defyx" DefyX. CN_R, // "cn/r" CryptoNightR (Monero's variant 4). CN_GPU, // "cn/gpu" CryptoNight-GPU (Ryo). CN_LITE_1, // "cn-lite/1" CryptoNight-Lite variant 1. 
@@ -45,6 +46,7 @@ class Benchmark : public IJobResultListener { const Algorithm::Id ba2a[BenchAlgo::MAX] = { Algorithm::RX_0, Algorithm::RX_WOW, + Algorithm::DEFYX, Algorithm::CN_R, Algorithm::CN_GPU, Algorithm::CN_LITE_1, diff --git a/src/core/config/usage.h b/src/core/config/usage.h index b41ec6db..f1acd7c1 100644 --- a/src/core/config/usage.h +++ b/src/core/config/usage.h @@ -57,7 +57,7 @@ Options:\n\ #endif #ifdef XMRIG_ALGO_RANDOMX "\ - rx/wow, rx/loki\n" + rx/wow, rx/loki, defyx\n" #endif "\ -o, --url=URL URL of mining server\n\ diff --git a/src/crypto/cn/CnAlgo.h b/src/crypto/cn/CnAlgo.h index ed64331a..3cd88a5a 100644 --- a/src/crypto/cn/CnAlgo.h +++ b/src/crypto/cn/CnAlgo.h @@ -132,6 +132,7 @@ private: 0, // RX_0 0, // RX_WOW 0, // RX_LOKI + 0, // DEFYX # endif }; @@ -167,6 +168,7 @@ private: 0, // RX_0 0, // RX_WOW 0, // RX_LOKI + 0, // DEFYX # endif }; @@ -202,6 +204,7 @@ private: Algorithm::INVALID, // RX_0 Algorithm::INVALID, // RX_WOW Algorithm::INVALID, // RX_LOKI + Algorithm::INVALID, // DEFYX # endif }; }; diff --git a/src/crypto/common/Algorithm.cpp b/src/crypto/common/Algorithm.cpp index db6cb234..2d12ea6d 100644 --- a/src/crypto/common/Algorithm.cpp +++ b/src/crypto/common/Algorithm.cpp @@ -115,6 +115,7 @@ static AlgoName const algorithm_names[] = { { "RandomWOW", nullptr, Algorithm::RX_WOW }, { "randomx/loki", "rx/loki", Algorithm::RX_LOKI }, { "RandomXL", nullptr, Algorithm::RX_LOKI }, + { "DefyX", "defyx", Algorithm::DEFYX }, # endif }; @@ -159,6 +160,9 @@ size_t xmrig::Algorithm::l2() const case RX_WOW: return 0x20000; + case DEFYX: + return 0x20000; + default: break; } @@ -189,6 +193,9 @@ size_t xmrig::Algorithm::l3() const case RX_WOW: return oneMiB; + case DEFYX: + return 0x40000; + default: break; } @@ -241,6 +248,7 @@ xmrig::Algorithm::Family xmrig::Algorithm::family(Id id) case RX_0: case RX_WOW: case RX_LOKI: + case DEFYX: return RANDOM_X; # endif diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h index 
92fcc61e..8c708f9a 100644 --- a/src/crypto/common/Algorithm.h +++ b/src/crypto/common/Algorithm.h @@ -72,6 +72,7 @@ public: RX_0, // "rx/0" RandomX (reference configuration). RX_WOW, // "rx/wow" RandomWOW (Wownero). RX_LOKI, // "rx/loki" RandomXL (Loki). + DEFYX, // "defyx" DefyX (Scala). # endif MAX }; diff --git a/src/crypto/defyx/KangarooTwelve.c b/src/crypto/defyx/KangarooTwelve.c new file mode 100644 index 00000000..5f8b879f --- /dev/null +++ b/src/crypto/defyx/KangarooTwelve.c @@ -0,0 +1,271 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "KangarooTwelve.h" +#ifndef KeccakP1600timesN_excluded + // #include "KeccakP-1600-times2-SnP.h" + // #include "KeccakP-1600-times4-SnP.h" + // #include "KeccakP-1600-times8-SnP.h" +#endif + +#define chunkSize 8192 +#define laneSize 8 +#define suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ + +#define security 128 +#define capacity (2*security) +#define capacityInBytes (capacity/8) +#define capacityInLanes (capacityInBytes/laneSize) +#define rate (1600-capacity) +#define rateInBytes (rate/8) +#define rateInLanes (rateInBytes/laneSize) + +#define ParallelSpongeFastLoop( Parallellism ) \ + while ( inLen >= Parallellism * chunkSize ) { \ + ALIGN(KeccakP1600times##Parallellism##_statesAlignment) unsigned char states[KeccakP1600times##Parallellism##_statesSizeInBytes]; \ + unsigned char intermediate[Parallellism*capacityInBytes]; \ + unsigned int localBlockLen = chunkSize; \ + const unsigned char * localInput = input; \ + unsigned int i; \ + unsigned int fastLoopOffset; \ + \ + KeccakP1600times##Parallellism##_StaticInitialize(); \ + 
KeccakP1600times##Parallellism##_InitializeAll(states); \ + fastLoopOffset = KeccakP1600times##Parallellism##_12rounds_FastLoop_Absorb(states, rateInLanes, chunkSize / laneSize, rateInLanes, localInput, Parallellism * chunkSize); \ + localBlockLen -= fastLoopOffset; \ + localInput += fastLoopOffset; \ + for ( i = 0; i < Parallellism; ++i, localInput += chunkSize ) { \ + KeccakP1600times##Parallellism##_AddBytes(states, i, localInput, 0, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, suffixLeaf, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, 0x80, rateInBytes-1); \ + } \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + input += Parallellism * chunkSize; \ + inLen -= Parallellism * chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + KeccakP1600times##Parallellism##_ExtractLanesAll(states, intermediate, capacityInLanes, capacityInLanes ); \ + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, Parallellism * capacityInBytes) != 0) return 1; \ + } + +#define ParallelSpongeLoop( Parallellism ) \ + while ( inLen >= Parallellism * chunkSize ) { \ + ALIGN(KeccakP1600times##Parallellism##_statesAlignment) unsigned char states[KeccakP1600times##Parallellism##_statesSizeInBytes]; \ + unsigned char intermediate[Parallellism*capacityInBytes]; \ + unsigned int localBlockLen = chunkSize; \ + const unsigned char * localInput = input; \ + unsigned int i; \ + \ + KeccakP1600times##Parallellism##_StaticInitialize(); \ + KeccakP1600times##Parallellism##_InitializeAll(states); \ + while(localBlockLen >= rateInBytes) { \ + KeccakP1600times##Parallellism##_AddLanesAll(states, localInput, rateInLanes, chunkSize / laneSize); \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + localBlockLen -= rateInBytes; \ + localInput += rateInBytes; \ + } \ + for ( i = 0; i < Parallellism; ++i, localInput += chunkSize ) { \ + KeccakP1600times##Parallellism##_AddBytes(states, i, 
localInput, 0, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, suffixLeaf, localBlockLen); \ + KeccakP1600times##Parallellism##_AddByte(states, i, 0x80, rateInBytes-1); \ + } \ + KeccakP1600times##Parallellism##_PermuteAll_12rounds(states); \ + input += Parallellism * chunkSize; \ + inLen -= Parallellism * chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + KeccakP1600times##Parallellism##_ExtractLanesAll(states, intermediate, capacityInLanes, capacityInLanes ); \ + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, Parallellism * capacityInBytes) != 0) return 1; \ + } + +static unsigned int right_encode( unsigned char * encbuf, size_t value ) +{ + unsigned int n, i; + size_t v; + + for ( v = value, n = 0; v && (n < sizeof(size_t)); ++n, v >>= 8 ) + ; /* empty */ + for ( i = 1; i <= n; ++i ) + encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); + encbuf[n] = (unsigned char)n; + return n + 1; +} + +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputLen) +{ + ktInstance->fixedOutputLength = outputLen; + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 0; + ktInstance->phase = ABSORBING; + return KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->finalNode, rate, capacity); +} + +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inLen) +{ + if (ktInstance->phase != ABSORBING) + return 1; + + if ( ktInstance->blockNumber == 0 ) { + /* First block, absorb in final node */ + unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? 
inLen : (chunkSize - ktInstance->queueAbsorbedLen); + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + ktInstance->queueAbsorbedLen += len; + if ( (ktInstance->queueAbsorbedLen == chunkSize) && (inLen != 0) ) { + /* First block complete and more input data available, finalize it */ + const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, &padding, 1) != 0) + return 1; + ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ + } + } + else if ( ktInstance->queueAbsorbedLen != 0 ) { + /* There is data in the queue, absorb further in queue until block complete */ + unsigned int len = (inLen < (chunkSize - ktInstance->queueAbsorbedLen)) ? inLen : (chunkSize - ktInstance->queueAbsorbedLen); + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + ktInstance->queueAbsorbedLen += len; + if ( ktInstance->queueAbsorbedLen == chunkSize ) { + unsigned char intermediate[capacityInBytes]; + ktInstance->queueAbsorbedLen = 0; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + } + + #if defined(KeccakP1600times8_implementation) && !defined(KeccakP1600times8_isFallback) + #if defined(KeccakP1600times8_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 8 ) + #else + ParallelSpongeLoop( 8 ) + #endif + #endif + + #if defined(KeccakP1600times4_implementation) && 
!defined(KeccakP1600times4_isFallback) + #if defined(KeccakP1600times4_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 4 ) + #else + ParallelSpongeLoop( 4 ) + #endif + #endif + + #if defined(KeccakP1600times2_implementation) && !defined(KeccakP1600times2_isFallback) + #if defined(KeccakP1600times2_12rounds_FastLoop_supported) + ParallelSpongeFastLoop( 2 ) + #else + ParallelSpongeLoop( 2 ) + #endif + #endif + + while ( inLen > 0 ) { + unsigned int len = (inLen < chunkSize) ? inLen : chunkSize; + if (KeccakWidth1600_12rounds_SpongeInitialize(&ktInstance->queueNode, rate, capacity) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->queueNode, input, len) != 0) + return 1; + input += len; + inLen -= len; + if ( len == chunkSize ) { + unsigned char intermediate[capacityInBytes]; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + else + ktInstance->queueAbsorbedLen = len; + } + + return 0; +} + +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char * output, const unsigned char * customization, size_t customLen) +{ + unsigned char encbuf[sizeof(size_t)+1+2]; + unsigned char padding; + + if (ktInstance->phase != ABSORBING) + return 1; + + /* Absorb customization | right_encode(customLen) */ + if ((customLen != 0) && (KangarooTwelve_Update(ktInstance, customization, customLen) != 0)) + return 1; + if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customLen)) != 0) + return 1; + + if ( ktInstance->blockNumber == 0 ) { + /* Non complete first block in final node, pad it */ + padding = 0x07; /* '11': message hop, final node */ + } + else { + unsigned int n; + + if ( 
ktInstance->queueAbsorbedLen != 0 ) { + /* There is data in the queue node */ + unsigned char intermediate[capacityInBytes]; + ++ktInstance->blockNumber; + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->queueNode, suffixLeaf) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->queueNode, intermediate, capacityInBytes) != 0) + return 1; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, intermediate, capacityInBytes) != 0) + return 1; + } + --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ + n = right_encode(encbuf, ktInstance->blockNumber); + encbuf[n++] = 0xFF; + encbuf[n++] = 0xFF; + if (KeccakWidth1600_12rounds_SpongeAbsorb(&ktInstance->finalNode, encbuf, n) != 0) + return 1; + padding = 0x06; /* '01': chaining hop, final node */ + } + if (KeccakWidth1600_12rounds_SpongeAbsorbLastFewBits(&ktInstance->finalNode, padding) != 0) + return 1; + if ( ktInstance->fixedOutputLength != 0 ) { + ktInstance->phase = FINAL; + return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); + } + ktInstance->phase = SQUEEZING; + return 0; +} + +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char * output, size_t outputLen) +{ + if (ktInstance->phase != SQUEEZING) + return 1; + return KeccakWidth1600_12rounds_SpongeSqueeze(&ktInstance->finalNode, output, outputLen); +} + +int KangarooTwelve( const unsigned char * input, size_t inLen, unsigned char * output, size_t outLen, const unsigned char * customization, size_t customLen ) +{ + KangarooTwelve_Instance ktInstance; + + if (outLen == 0) + return 1; + if (KangarooTwelve_Initialize(&ktInstance, outLen) != 0) + return 1; + if (KangarooTwelve_Update(&ktInstance, input, inLen) != 0) + return 1; + return KangarooTwelve_Final(&ktInstance, output, customization, customLen); +} diff --git a/src/crypto/defyx/KangarooTwelve.h b/src/crypto/defyx/KangarooTwelve.h 
new file mode 100644 index 00000000..0e9ed41e --- /dev/null +++ b/src/crypto/defyx/KangarooTwelve.h @@ -0,0 +1,89 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KangarooTwelve_h_ +#define _KangarooTwelve_h_ + +#ifndef KeccakP1600_excluded + +#include +#include "align.h" +#include "KeccakSpongeWidth1600.h" +#include "Phases.h" + +typedef KCP_Phases KangarooTwelve_Phases; + +typedef struct { + KeccakWidth1600_12rounds_SpongeInstance queueNode; + KeccakWidth1600_12rounds_SpongeInstance finalNode; + size_t fixedOutputLength; + size_t blockNumber; + unsigned int queueAbsorbedLen; + KangarooTwelve_Phases phase; +} KangarooTwelve_Instance; + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen ); + +/** + * Function to initialize a KangarooTwelve instance. + * @param ktInstance Pointer to the instance to be initialized. + * @param outputByteLen The desired number of output bytes, + * or 0 for an arbitrarily-long output. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); + +/** + * Function to give input data to be absorbed. + * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). + * @param input Pointer to the input message data (M). + * @param inputByteLen The number of bytes provided in the input message data. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); + +/** + * Function to call after all the input message has been input, and to get + * output bytes if the length was specified when calling KangarooTwelve_Initialize(). + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of + * output bytes is equal to @a outputByteLen. + * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes + * must be extracted using the KangarooTwelve_Squeeze() function. + * @param output Pointer to the buffer where to store the output data. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); + +/** + * Function to squeeze output data. + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * @param data Pointer to the buffer where to store the output data. + * @param outputByteLen The number of output bytes desired. + * @pre KangarooTwelve_Final() must have been already called. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); + +#endif + +#endif diff --git a/src/crypto/defyx/KeccakP-1600-SnP.h b/src/crypto/defyx/KeccakP-1600-SnP.h new file mode 100644 index 00000000..907e3581 --- /dev/null +++ b/src/crypto/defyx/KeccakP-1600-SnP.h @@ -0,0 +1,41 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to SnP-documentation.h for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +#define KeccakP1600_implementation "64-bit reference implementation" +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 + +#ifdef KeccakReference +void KeccakP1600_StaticInitialize( void ); +#else +#define KeccakP1600_StaticInitialize() +#endif +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); +void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_Permute_24rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char 
*input, unsigned char *output, unsigned int offset, unsigned int length); + +#endif diff --git a/src/crypto/defyx/KeccakP-1600-reference.c b/src/crypto/defyx/KeccakP-1600-reference.c new file mode 100644 index 00000000..0c126030 --- /dev/null +++ b/src/crypto/defyx/KeccakP-1600-reference.c @@ -0,0 +1,402 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +This file implements Keccak-p[1600] in a SnP-compatible way. +Please refer to SnP-documentation.h for more details. + +This implementation comes with KeccakP-1600-SnP.h in the same folder. +Please refer to LowLevel.build for the exact list of other files it must be combined with. 
+*/ + +#include +#include +#include +#include +#include "brg_endian.h" +#ifdef KeccakReference +#include "displayIntermediateValues.h" +#endif + +typedef unsigned char UINT8; +typedef unsigned long long UINT64; +typedef UINT64 tKeccakLane; + +#define maxNrRounds 24 +#define nrLanes 25 +#define index(x, y) (((x)%5)+5*((y)%5)) + +#ifdef KeccakReference + +static tKeccakLane KeccakRoundConstants[maxNrRounds]; +static unsigned int KeccakRhoOffsets[nrLanes]; + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_InitializeRoundConstants(void); +void KeccakP1600_InitializeRhoOffsets(void); +static int LFSR86540(UINT8 *LFSR); + +void KeccakP1600_StaticInitialize(void) +{ + if (sizeof(tKeccakLane) != 8) { + printf("tKeccakLane should be 64-bit wide\n"); + abort(); + } + KeccakP1600_InitializeRoundConstants(); + KeccakP1600_InitializeRhoOffsets(); +} + +void KeccakP1600_InitializeRoundConstants(void) +{ + UINT8 LFSRstate = 0x01; + unsigned int i, j, bitPosition; + + for(i=0; i> (64-offset))) : a) + +static void theta(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane C[5], D[5]; + + for(x=0; x<5; x++) { + C[x] = 0; + for(y=0; y<5; y++) + C[x] ^= A[index(x, y)]; + } + for(x=0; x<5; x++) + D[x] = ROL64(C[(x+1)%5], 1) ^ C[(x+4)%5]; + for(x=0; x<5; x++) + for(y=0; y<5; y++) + A[index(x, y)] ^= D[x]; +} + +static void rho(tKeccakLane *A) +{ + unsigned int x, y; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(x, y)] = ROL64(A[index(x, y)], KeccakRhoOffsets[index(x, y)]); +} + +static void pi(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane tempA[25]; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + tempA[index(x, y)] = A[index(x, y)]; + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(0*x+1*y, 2*x+3*y)] = tempA[index(x, y)]; +} + +static void chi(tKeccakLane *A) +{ + unsigned int x, y; + tKeccakLane C[5]; + + for(y=0; y<5; y++) { + for(x=0; x<5; x++) + C[x] = A[index(x, y)] ^ ((~A[index(x+1, y)]) & A[index(x+2, y)]); + for(x=0; x<5; 
x++) + A[index(x, y)] = C[x]; + } +} + +static void iota(tKeccakLane *A, unsigned int indexRound) +{ + A[index(0, 0)] ^= KeccakRoundConstants[indexRound]; +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + assert(offset < 200); + assert(offset+length <= 200); + memcpy(data, (unsigned char*)state+offset, length); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length) +{ + unsigned int i; + + assert(offset < 200); + assert(offset+length <= 200); + for(i=0; i> 32)); + fprintf(f, "%08X", (unsigned int)(KeccakRoundConstants[i] & 0xFFFFFFFFULL)); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +void KeccakP1600_DisplayRhoOffsets(FILE *f) +{ + unsigned int x, y; + + for(y=0; y<5; y++) for(x=0; x<5; x++) { + fprintf(f, "RhoOffset[%i][%i] = ", x, y); + fprintf(f, "%2i", KeccakRhoOffsets[index(x, y)]); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} diff --git a/src/crypto/defyx/KeccakSponge-common.h b/src/crypto/defyx/KeccakSponge-common.h new file mode 100644 index 00000000..8fb3ba1a --- /dev/null +++ b/src/crypto/defyx/KeccakSponge-common.h @@ -0,0 +1,35 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakSpongeCommon_h_ +#define _KeccakSpongeCommon_h_ + +#include +#include "align.h" + +#define KCP_DeclareSpongeStructure(prefix, size, alignment) \ + ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \ + unsigned char state[size]; \ + unsigned int rate; \ + unsigned int byteIOIndex; \ + int squeezing; \ + } prefix##_SpongeInstance; + +#define KCP_DeclareSpongeFunctions(prefix) \ + int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \ + int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \ + int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \ + int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \ + int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen); + +#endif diff --git a/src/crypto/defyx/KeccakSponge.inc b/src/crypto/defyx/KeccakSponge.inc new file mode 100644 index 00000000..f6c59cee --- /dev/null +++ b/src/crypto/defyx/KeccakSponge.inc @@ -0,0 +1,311 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define JOIN0(a, b) a ## b +#define JOIN(a, b) JOIN0(a, b) + +#define Sponge JOIN(prefix, _Sponge) +#define SpongeInstance JOIN(prefix, _SpongeInstance) +#define SpongeInitialize JOIN(prefix, _SpongeInitialize) +#define SpongeAbsorb JOIN(prefix, _SpongeAbsorb) +#define SpongeAbsorbLastFewBits JOIN(prefix, _SpongeAbsorbLastFewBits) +#define SpongeSqueeze JOIN(prefix, _SpongeSqueeze) + +#define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes) +#define SnP_stateAlignment JOIN(SnP, _stateAlignment) +#define SnP_StaticInitialize JOIN(SnP, _StaticInitialize) +#define SnP_Initialize JOIN(SnP, _Initialize) +#define SnP_AddByte JOIN(SnP, _AddByte) +#define SnP_AddBytes JOIN(SnP, _AddBytes) +#define SnP_ExtractBytes JOIN(SnP, _ExtractBytes) + +int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen) +{ + ALIGN(SnP_stateAlignment) unsigned char state[SnP_stateSizeInBytes]; + unsigned int partialBlock; + const unsigned char *curInput = input; + unsigned char *curOutput = output; + unsigned int rateInBytes = rate/8; + + if (rate+capacity != SnP_width) + return 1; + if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0)) + return 1; + if (suffix == 0) + return 1; + + /* Initialize the state */ + SnP_StaticInitialize(); + SnP_Initialize(state); + + /* First, absorb whole blocks */ +#ifdef SnP_FastLoop_Absorb + if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) { + /* fast lane: whole lane rate */ + size_t j; + j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen); + curInput += j; + inputByteLen -= j; + } +#endif + while(inputByteLen >= (size_t)rateInBytes) { + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", curInput, rateInBytes); + #endif + SnP_AddBytes(state, curInput, 0, rateInBytes); + SnP_Permute(state); + curInput += rateInBytes; 
+ inputByteLen -= rateInBytes; + } + + /* Then, absorb what remains */ + partialBlock = (unsigned int)inputByteLen; + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock); + #endif + SnP_AddBytes(state, curInput, 0, partialBlock); + + /* Finally, absorb the suffix */ + #ifdef KeccakReference + { + unsigned char delimitedData1[1]; + delimitedData1[0] = suffix; + displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1); + } + #endif + /* Last few bits, whose delimiter coincides with first bit of padding */ + SnP_AddByte(state, suffix, partialBlock); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1))) + SnP_Permute(state); + /* Second bit of padding */ + SnP_AddByte(state, 0x80, rateInBytes-1); + #ifdef KeccakReference + { + unsigned char block[SnP_width/8]; + memset(block, 0, SnP_width/8); + block[rateInBytes-1] = 0x80; + displayBytes(1, "Second bit of padding", block, rateInBytes); + } + #endif + SnP_Permute(state); + #ifdef KeccakReference + displayText(1, "--- Switching to squeezing phase ---"); + #endif + + /* First, output whole blocks */ + while(outputByteLen > (size_t)rateInBytes) { + SnP_ExtractBytes(state, curOutput, 0, rateInBytes); + SnP_Permute(state); + #ifdef KeccakReference + displayBytes(1, "Squeezed block", curOutput, rateInBytes); + #endif + curOutput += rateInBytes; + outputByteLen -= rateInBytes; + } + + /* Finally, output what remains */ + partialBlock = (unsigned int)outputByteLen; + SnP_ExtractBytes(state, curOutput, 0, partialBlock); + #ifdef KeccakReference + displayBytes(1, "Squeezed block (part)", curOutput, partialBlock); + #endif + + return 0; +} + +/* ---------------------------------------------------------------- */ +/* ---------------------------------------------------------------- */ +/* 
---------------------------------------------------------------- */ + +int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity) +{ + if (rate+capacity != SnP_width) + return 1; + if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0)) + return 1; + SnP_StaticInitialize(); + SnP_Initialize(instance->state); + instance->rate = rate; + instance->byteIOIndex = 0; + instance->squeezing = 0; + + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + const unsigned char *curData; + unsigned int rateInBytes = instance->rate/8; + + if (instance->squeezing) + return 1; /* Too late for additional input */ + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == 0) && (dataByteLen >= (i + rateInBytes))) { +#ifdef SnP_FastLoop_Absorb + /* processing full blocks first */ + if ((rateInBytes % (SnP_width/200)) == 0) { + /* fast lane: whole lane rate */ + j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i); + i += j; + curData += j; + } + else { +#endif + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", curData, rateInBytes); + #endif + SnP_AddBytes(instance->state, curData, 0, rateInBytes); + SnP_Permute(instance->state); + curData+=rateInBytes; + } + i = dataByteLen - j; +#ifdef SnP_FastLoop_Absorb + } +#endif + } + else { + /* normal lane: using the message queue */ + partialBlock = (unsigned int)(dataByteLen - i); + if (partialBlock+instance->byteIOIndex > rateInBytes) + partialBlock = rateInBytes-instance->byteIOIndex; + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed (part)", curData, partialBlock); + #endif + i += partialBlock; + + SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + 
curData += partialBlock; + instance->byteIOIndex += partialBlock; + if (instance->byteIOIndex == rateInBytes) { + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + } + } + } + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData) +{ + unsigned int rateInBytes = instance->rate/8; + + if (delimitedData == 0) + return 1; + if (instance->squeezing) + return 1; /* Too late for additional input */ + + #ifdef KeccakReference + { + unsigned char delimitedData1[1]; + delimitedData1[0] = delimitedData; + displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1); + } + #endif + /* Last few bits, whose delimiter coincides with first bit of padding */ + SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) + SnP_Permute(instance->state); + /* Second bit of padding */ + SnP_AddByte(instance->state, 0x80, rateInBytes-1); + #ifdef KeccakReference + { + unsigned char block[SnP_width/8]; + memset(block, 0, SnP_width/8); + block[rateInBytes-1] = 0x80; + displayBytes(1, "Second bit of padding", block, rateInBytes); + } + #endif + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 1; + #ifdef KeccakReference + displayText(1, "--- Switching to squeezing phase ---"); + #endif + return 0; +} + +/* ---------------------------------------------------------------- */ + +int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + unsigned int rateInBytes = instance->rate/8; + unsigned char *curData; + + if (!instance->squeezing) + SpongeAbsorbLastFewBits(instance, 0x01); + + i = 0; + curData = data; + while(i < 
dataByteLen) { + if ((instance->byteIOIndex == rateInBytes) && (dataByteLen >= (i + rateInBytes))) { + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + SnP_Permute(instance->state); + SnP_ExtractBytes(instance->state, curData, 0, rateInBytes); + #ifdef KeccakReference + displayBytes(1, "Squeezed block", curData, rateInBytes); + #endif + curData+=rateInBytes; + } + i = dataByteLen - j; + } + else { + /* normal lane: using the message queue */ + if (instance->byteIOIndex == rateInBytes) { + SnP_Permute(instance->state); + instance->byteIOIndex = 0; + } + partialBlock = (unsigned int)(dataByteLen - i); + if (partialBlock+instance->byteIOIndex > rateInBytes) + partialBlock = rateInBytes-instance->byteIOIndex; + i += partialBlock; + + SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + #ifdef KeccakReference + displayBytes(1, "Squeezed block (part)", curData, partialBlock); + #endif + curData += partialBlock; + instance->byteIOIndex += partialBlock; + } + } + return 0; +} + +/* ---------------------------------------------------------------- */ + +#undef Sponge +#undef SpongeInstance +#undef SpongeInitialize +#undef SpongeAbsorb +#undef SpongeAbsorbLastFewBits +#undef SpongeSqueeze +#undef SnP_stateSizeInBytes +#undef SnP_stateAlignment +#undef SnP_StaticInitialize +#undef SnP_Initialize +#undef SnP_AddByte +#undef SnP_AddBytes +#undef SnP_ExtractBytes diff --git a/src/crypto/defyx/KeccakSpongeWidth1600.c b/src/crypto/defyx/KeccakSpongeWidth1600.c new file mode 100644 index 00000000..672ec36d --- /dev/null +++ b/src/crypto/defyx/KeccakSpongeWidth1600.c @@ -0,0 +1,54 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include "KeccakSpongeWidth1600.h" + +#ifdef KeccakReference + #include "displayIntermediateValues.h" +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + + #define prefix KeccakWidth1600 + #define SnP KeccakP1600 + #define SnP_width 1600 + #define SnP_Permute KeccakP1600_Permute_24rounds + #if defined(KeccakF1600_FastLoop_supported) + #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb + #endif + #include "KeccakSponge.inc" + #undef prefix + #undef SnP + #undef SnP_width + #undef SnP_Permute + #undef SnP_FastLoop_Absorb +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + + #define prefix KeccakWidth1600_12rounds + #define SnP KeccakP1600 + #define SnP_width 1600 + #define SnP_Permute KeccakP1600_Permute_12rounds + #if defined(KeccakP1600_12rounds_FastLoop_supported) + #define SnP_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb + #endif + #include "KeccakSponge.inc" + #undef prefix + #undef SnP + #undef SnP_width + #undef SnP_Permute + #undef SnP_FastLoop_Absorb +#endif diff --git a/src/crypto/defyx/KeccakSpongeWidth1600.h b/src/crypto/defyx/KeccakSpongeWidth1600.h new file mode 100644 index 00000000..1558256c --- /dev/null +++ b/src/crypto/defyx/KeccakSpongeWidth1600.h @@ -0,0 +1,31 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakSpongeWidth1600_h_ +#define _KeccakSpongeWidth1600_h_ + +#include "KeccakSponge-common.h" + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) + KCP_DeclareSpongeFunctions(KeccakWidth1600) +#endif + +#ifndef KeccakP1600_excluded + #include "KeccakP-1600-SnP.h" + KCP_DeclareSpongeStructure(KeccakWidth1600_12rounds, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) + KCP_DeclareSpongeFunctions(KeccakWidth1600_12rounds) +#endif + +#endif diff --git a/src/crypto/defyx/Phases.h b/src/crypto/defyx/Phases.h new file mode 100644 index 00000000..769125c3 --- /dev/null +++ b/src/crypto/defyx/Phases.h @@ -0,0 +1,22 @@ +/* +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _Phases_h_ +#define _Phases_h_ + +typedef enum { + NOT_INITIALIZED, + ABSORBING, + FINAL, + SQUEEZING +} KCP_Phases; + +#endif diff --git a/src/crypto/defyx/align.h b/src/crypto/defyx/align.h new file mode 100644 index 00000000..90c1b37a --- /dev/null +++ b/src/crypto/defyx/align.h @@ -0,0 +1,32 @@ +/* +Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen, +Michaël Peeters, Gilles Van Assche and Ronny Van Keer, +hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to our website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _align_h_ +#define _align_h_ + +/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif + +#endif diff --git a/src/crypto/defyx/brg_endian.h b/src/crypto/defyx/brg_endian.h new file mode 100644 index 00000000..7c640b90 --- /dev/null +++ b/src/crypto/defyx/brg_endian.h @@ -0,0 +1,143 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( 
__BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ + defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) 
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/src/crypto/defyx/defyx.cpp b/src/crypto/defyx/defyx.cpp new file mode 100644 index 00000000..0d1a9697 --- /dev/null +++ b/src/crypto/defyx/defyx.cpp @@ -0,0 +1,109 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "defyx.h" +#include "crypto/randomx/blake2/blake2.h" +#include "crypto/randomx/vm_interpreted.hpp" +#include "crypto/randomx/vm_interpreted_light.hpp" +#include "crypto/randomx/vm_compiled.hpp" +#include "crypto/randomx/vm_compiled_light.hpp" +#include "crypto/randomx/jit_compiler_x86_static.hpp" + +#include + +extern "C" { +#include "yescrypt.h" +#include "KangarooTwelve.h" +} + +#define YESCRYPT_FLAGS YESCRYPT_RW +#define YESCRYPT_BASE_N 2048 +#define YESCRYPT_R 8 +#define YESCRYPT_P 1 + +RandomX_ConfigurationScala::RandomX_ConfigurationScala() +{ + ArgonMemory = 131072; + ArgonIterations = 2; + ArgonSalt = "DefyXScala\x13"; + CacheAccesses = 2; + DatasetBaseSize = 33554432; + ProgramSize = 64; + ProgramIterations = 1024; + ProgramCount = 4; + ScratchpadL3_Size = 262144; + ScratchpadL2_Size = 131072; + ScratchpadL1_Size = 65536; +} + +RandomX_ConfigurationScala RandomX_ScalaConfig; + +int sipesh(void *out, size_t outlen, const void *in, size_t inlen, const void *salt, size_t saltlen, unsigned int t_cost, unsigned int m_cost) +{ + yescrypt_local_t local; + int retval; + + if (yescrypt_init_local(&local)) + return -1; + retval = yescrypt_kdf(NULL, &local, (const uint8_t*)in, inlen, (const uint8_t*)salt, saltlen, + (uint64_t)YESCRYPT_BASE_N << m_cost, YESCRYPT_R, YESCRYPT_P, + t_cost, 0, YESCRYPT_FLAGS, (uint8_t*)out, outlen); + if (yescrypt_free_local(&local)) + return -1; + return retval; +} + +int k12(const void *data, size_t length, 
void *hash) +{ + + int kDo = KangarooTwelve((const unsigned char *)data, length, (unsigned char *)hash, 32, 0, 0); + return kDo; +} + + +extern "C" { + + void defyx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output) { + assert(machine != nullptr); + assert(inputSize == 0 || input != nullptr); + assert(output != nullptr); + alignas(16) uint64_t tempHash[8]; + blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + sipesh(tempHash, sizeof(tempHash), input, inputSize, input, inputSize, 0, 0); + k12(input, inputSize, tempHash); + machine->initScratchpad(&tempHash); + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { + machine->run(&tempHash); + blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(&tempHash); + machine->getFinalResult(output, RANDOMX_HASH_SIZE); + } + +} diff --git a/src/crypto/defyx/defyx.h b/src/crypto/defyx/defyx.h new file mode 100644 index 00000000..a67e7fde --- /dev/null +++ b/src/crypto/defyx/defyx.h @@ -0,0 +1,57 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DEFYX_H +#define DEFYX_H + +#include "crypto/randomx/randomx.h" + +struct RandomX_ConfigurationScala : public RandomX_ConfigurationBase { RandomX_ConfigurationScala(); }; + +extern RandomX_ConfigurationScala RandomX_ScalaConfig; + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * Calculates a RandomX hash value. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void defyx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/crypto/defyx/sha256.c b/src/crypto/defyx/sha256.c new file mode 100644 index 00000000..49b937cd --- /dev/null +++ b/src/crypto/defyx/sha256.c @@ -0,0 +1,411 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "sysendian.h" + +#include "sha256.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
+ */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. 
Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX_Y * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update_Y(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update_Y(ctx, len, 8); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ +void +SHA256_Init_Y(SHA256_CTX_Y * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. 
*/ +void +HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init_Y(&ctx->ictx); + SHA256_Update_Y(&ctx->ictx, K, Klen); + SHA256_Final_Y(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init_Y(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update_Y(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init_Y(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update_Y(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +void +HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + + /* Feed data to the inner SHA256 operation. */ + SHA256_Update_Y(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +void +HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final_Y(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update_Y(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final_Y(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX_Y PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); + HMAC_SHA256_Update_Y(&hctx, ivec, 4); + HMAC_SHA256_Final_Y(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&hctx, U, 32); + HMAC_SHA256_Final_Y(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); +} diff --git a/src/crypto/defyx/sha256.h b/src/crypto/defyx/sha256.h new file mode 100644 index 00000000..f935cfaa --- /dev/null +++ b/src/crypto/defyx/sha256.h @@ -0,0 +1,62 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include <sys/types.h> + +#include <stdint.h> + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t count[2]; + unsigned char buf[64]; +} SHA256_CTX_Y; + +typedef struct HMAC_SHA256Context { + SHA256_CTX_Y ictx; + SHA256_CTX_Y octx; +} HMAC_SHA256_CTX_Y; + +void SHA256_Init_Y(SHA256_CTX_Y *); +void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); +void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); +void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + +#endif /* !_SHA256_H_ */ diff --git a/src/crypto/defyx/sysendian.h b/src/crypto/defyx/sysendian.h new file mode 100644 index 00000000..04951946 --- /dev/null +++ b/src/crypto/defyx/sysendian.h @@ -0,0 +1,138 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the <sys/endian.h> we have isn't usable. 
*/ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include <sys/endian.h> + +#else + +#include <stdint.h> + +static inline uint32_t +be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static inline void +be32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static inline uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static inline void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + +static inline uint32_t +le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void +le32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +static inline uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static inline void +le64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 
8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/src/crypto/defyx/yescrypt-best.c b/src/crypto/defyx/yescrypt-best.c new file mode 100644 index 00000000..4e836215 --- /dev/null +++ b/src/crypto/defyx/yescrypt-best.c @@ -0,0 +1,5 @@ +#ifdef __SSE2__ +#include "yescrypt-simd.c" +#else +#include "yescrypt-opt.c" +#endif diff --git a/src/crypto/defyx/yescrypt-opt.c b/src/crypto/defyx/yescrypt-opt.c new file mode 100644 index 00000000..3da0a532 --- /dev/null +++ b/src/crypto/defyx/yescrypt-opt.c @@ -0,0 +1,1102 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "sha256.h" +#include "sysendian.h" + +#include "yescrypt.h" + +#include "yescrypt-platform.c" + +static inline void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = *src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +} + +static inline void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +} + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static inline void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static inline void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define UNCOMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + UNCOMBINE(0, 0, 6) + UNCOMBINE(1, 5, 3) + UNCOMBINE(2, 2, 0) + UNCOMBINE(3, 7, 5) + UNCOMBINE(4, 4, 2) + UNCOMBINE(5, 1, 7) + UNCOMBINE(6, 6, 4) + 
UNCOMBINE(7, 3, 1) +#undef UNCOMBINE +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void +salsa20(uint64_t B[8], uint32_t doublerounds) +{ + salsa20_blk_t X; +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + do { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } while (--doublerounds); +#undef x + + { + uint32_t i; + salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. 
+ */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20(X, 4); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20(X, 4); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } +} + +/* These are tunable */ +#define PWXsimple 2 +#define PWXgather 4 +#define PWXrounds 6 +#define Swidth 8 + +/* Derived values. Not tunable on their own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint64_t)) +#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) +#define Swords (Sbytes / sizeof(uint64_t)) +#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) +#define rmin ((PWXbytes + 127) / 128) + +#if PWXbytes % 32 != 0 +#error "blkcpy() and blkxor() currently work on multiples of 32." +#endif + +typedef struct { + uint64_t *S0, *S1, *S2; + size_t w; +} pwxform_ctx_t; + +#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
+ */ +static void +pwxform(uint64_t * B, pwxform_ctx_t * ctx) +{ + uint64_t (*X)[PWXsimple] = (uint64_t (*)[PWXsimple])B; + uint64_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i, j; +#if PWXsimple > 2 + size_t k; +#endif + + /* 2: for j = 0 to PWXgather - 1 do */ + for (j = 0; j < PWXgather; j++) { + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; +#if PWXsimple > 1 + uint64_t x1 = Xj[1]; +#endif + + /* 1: for i = 0 to PWXrounds - 1 do */ + for (i = 0; i < PWXrounds; i++) { + uint64_t x = x0 & Smask2; + const uint64_t *p0, *p1; + + /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p0 = (const uint64_t *)((uint8_t *)S0 + (uint32_t)x); + /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p1 = (const uint64_t *)((uint8_t *)S1 + (x >> 32)); + + /* 5: for k = 0 to PWXsimple - 1 do */ + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + +#if PWXsimple > 1 + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; +#endif + +#if PWXsimple > 2 + /* 5: for k = 0 to PWXsimple - 1 do */ + for (k = 2; k < PWXsimple; k++) { + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ + x = Xj[k]; + + x = (uint64_t)(x >> 32) * (uint32_t)x; + x += p0[k]; + x ^= p1[k]; + + Xj[k] = x; + } +#endif + + /* 8: if (i != 0) and (i != PWXrounds - 1) */ + if ((i - 1) < PWXrounds - 2) { + /* 9: S2_w <-- B_j */ + /* 10: w <-- w + 1 */ + uint64_t *p2 = (uint64_t *)((uint8_t *)S2 + w); + w += PWXbytes; + p2[0] = x0; +#if PWXsimple > 1 + p2[1] = x1; +#endif +#if PWXsimple > 2 + for (k = 2; k < PWXsimple; k++) + p2[k] = Xj[k]; +#endif + } + } + + Xj[0] = x0; +#if PWXsimple > 1 + Xj[1] = x1; +#endif + + w -= (PWXrounds - 2) * PWXbytes - PWXsimple * 8; + } + + /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ + ctx->S0 = S2; + ctx->S1 = S0; + ctx->S2 = S1; + 
/* 15: w <-- w mod 2^Swidth */ + ctx->w = (w + (PWXrounds - 3) * PWXbytes) & Smask; +} + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/2, ctx, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, + pwxform_ctx_t * ctx, size_t r) +{ + size_t r1, r2, i; + + /* Convert 128-byte blocks to PWXbytes blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r1 = r * 128 / PWXbytes; + + /* 2: X <-- B'_{r_1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * PWXwords], PWXwords); + + /* 3: for i = 0 to r_1 - 1 do */ + /* 4: if r_1 > 1 */ + if (r1 > 1) { + /* 5: X <-- X \xor B'_i */ + blkxor(Bout, Bin, PWXwords); + } + + /* 7: X <-- pwxform(X) */ + /* 8: B'_i <-- X */ + pwxform(Bout, ctx); + + /* 3: for i = 0 to r_1 - 1 do */ + for (i = 1; i < r1; i++) { + /* 5: X <-- X \xor B'_i */ + blkcpy(&Bout[i * PWXwords], &Bout[(i - 1) * PWXwords], + PWXwords); + blkxor(&Bout[i * PWXwords], &Bin[i * PWXwords], PWXwords); + + /* 7: X <-- pwxform(X) */ + /* 8: B'_i <-- X */ + pwxform(&Bout[i * PWXwords], ctx); + } + +#if PWXbytes > 128 + /* + * Handle partial blocks. If we were using just one buffer, like in + * the algorithm specification, the data would already be there, but + * since we use separate input and output buffers, we may have to copy + * some data over (which will then be processed by the Salsa20/8 + * invocations below) in this special case - that is, when 128r is not + * a multiple of PWXbytes. Since PWXgather and PWXsimple must each be + * a power of 2 (per the specification), PWXbytes is also a power of 2. + * Thus, 128r is obviously a multiple of valid values of PWXbytes up to + * 128, inclusive. When PWXbytes is larger than that (thus, 256 or + * larger) we perform this extra check. 
+ */ + if (i * PWXwords < r * 16) + blkcpy(&Bout[i * PWXwords], &Bin[i * PWXwords], + r * 16 - i * PWXwords); +#endif + + /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ + i = (r1 - 1) * PWXbytes / 64; + + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* 11: B_i <-- H(B_i) */ + salsa20(&Bout[i * 8], 1); + + for (i++; i < r2; i++) { + /* 13: B_i <-- H(B_i \xor B_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20(&Bout[i * 8], 1); + } +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be even and + * no smaller than 2. 
+ */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + if (ctx) + blockmix_pwxform(X, Y, ctx, r); + else + blockmix_salsa8(X, Y, Z, r); + blkcpy(&V[s], Y, s); + + X = XY; + + if (VROM) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix_pwxform(X, Y, ctx, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + } + } else if (flags & YESCRYPT_RW) { + /* 4: X <-- H(X) */ + blockmix_pwxform(Y, X, ctx, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) 
*/ + blockmix_pwxform(X, Y, ctx, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix_pwxform(Y, X, ctx, r); + } + } else { + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + /* 4: X <-- H(X) */ + blockmix_salsa8(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The value Nloop must be even. 
+ */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 16 * r; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t i, j; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + if (VROM) { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix_pwxform(X, Y, ctx, r); + + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + + blockmix_pwxform(Y, X, ctx, r); + } + } else if (ctx) { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix_pwxform(X, Y, ctx, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix_pwxform(Y, X, ctx, r); + } while (--i); + } else { + uint64_t * Z = &XY[2 * s]; + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, 
&V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + blockmix_salsa8(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + blockmix_salsa8(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. 
+ */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const uint64_t * VROM, + uint64_t * XY, uint8_t * S, uint8_t * passwd) +{ + size_t s = 16 * r; + uint64_t Nchunk, Nloop_all, Nloop_rw; + uint32_t i; + + /* 1: n <-- N / p */ + Nchunk = N / p; + + /* 2: Nloop_all <-- fNloop(n, t, flags) */ + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + /* 6: Nloop_rw <-- 0 */ + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) { + Nloop_rw = Nloop_all; + } else { + /* 3: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + /* 4: Nloop_rw <-- Nloop_all / p */ + Nloop_rw = Nloop_all / p; + } + } + + /* 8: n <-- n - (n mod 2) */ + Nchunk &= ~(uint64_t)1; /* round down to even */ + /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ + Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ + + /* 11: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + /* 12: u <-- in */ + uint64_t Vchunk = i * Nchunk; + /* 13: if i = p - 1 */ + /* 14: n <-- N - u */ + /* 15: end if */ + /* 16: v <-- u + n - 1 */ + uint64_t Np = (i < p - 1) ? 
Nchunk : (N - Vchunk); + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + uint64_t * XYp = &XY[i * (2 * s + 8)]; +#else + uint64_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + /* 17: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + uint64_t *Si = (uint64_t *)(S + i * Salloc); + /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ + smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, + Si, 0, NULL, XYp, NULL); + ctx_i = (pwxform_ctx_t *)(Si + Swords); + /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ + ctx_i->S2 = Si; + /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ + ctx_i->S1 = Si + Swords / 3; + /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ + ctx_i->S0 = Si + Swords / 3 * 2; + /* 22: w_i <-- 0 */ + ctx_i->w = 0; + /* 23: if i = 0 */ + if (i == 0) { + /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, Bp + (s - 8), 64); + HMAC_SHA256_Update_Y(&ctx, passwd, 32); + HMAC_SHA256_Final_Y(passwd, &ctx); + } + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ + smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); + } + /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, VROM, XYp, ctx_i); + } + + /* 30: for i = 0 to p - 1 do */ + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; +#ifdef _OPENMP + uint64_t * XYp = &XY[i * (2 * s + 8)]; +#else + uint64_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + if (flags & YESCRYPT_RW) + ctx_i = (pwxform_ctx_t *)(S + i * Salloc + Sbytes); + /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, 
buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +static int +yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + const uint64_t * VROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY; + uint8_t * S; + uint64_t sha256[4]; + uint8_t dk[sizeof(sha256)], * dkp = buf; + + /* Sanity-check parameters */ + if ((flags & ~YESCRYPT_KNOWN_FLAGS) || (!flags && t)) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + if (flags & YESCRYPT_RW) { + if ((flags & YESCRYPT_WORM) || (N / p <= 1) || (r < rmin)) { + errno = EINVAL; + return 
-1; + } + if (p > SIZE_MAX / Salloc) { + errno = ENOMEM; + return -1; + } + } +#ifdef _OPENMP + else if (N > SIZE_MAX / 128 / (r * p)) { + errno = ENOMEM; + return -1; + } +#endif + + NROM = 0; + VROM = NULL; + if (shared) { + NROM = shared->aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + VROM = shared->aligned; + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_RW)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_RW) { + size_t S_size = (size_t)Salloc * p; + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_RW) + S = (uint8_t *)XY + XY_size; + + if (flags) { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + (flags & __YESCRYPT_PREHASH) ? 
16 : 8); + HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, + (uint8_t *)B, B_size); + + if (flags) + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + + if (p == 1 || (flags & YESCRYPT_RW)) { + smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, + (uint8_t *)sha256); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, + &V[(size_t)16 * r * i * N], + NROM, VROM, + &XY[((size_t)32 * r + 8) * i], NULL, NULL); +#else + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, + NROM, VROM, XY, NULL, NULL); +#endif + } + } + + dkp = buf; + if (flags && buflen < sizeof(dk)) { + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, + dk, sizeof(dk)); + dkp = dk; + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if (flags && !(flags & __YESCRYPT_PREHASH)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + size_t clen = buflen; + if (clen > sizeof(dk)) + clen = sizeof(dk); + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(dk, &ctx); + memcpy(buf, dk, clen); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt or its revision as requested by the parameters. The inputs + * to this function are the same as those for yescrypt_kdf_body() above, with + * the addition of g, which controls hash upgrades (0 for no upgrades so far). + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, + yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + uint8_t dk[32]; + + if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && + p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, + dk, sizeof(dk)); + if (retval) + return retval; + passwd = dk; + passwdlen = sizeof(dk); + } + + do { + uint8_t * dkp = g ? dk : buf; + size_t dklen = g ? 
sizeof(dk) : buflen; + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N, r, p, t, flags, dkp, dklen); + if (retval) + return retval; + + passwd = dkp; + passwdlen = dklen; + + N <<= 2; + if (!N) + return -1; + t >>= 1; + } while (g--); + + return 0; +} diff --git a/src/crypto/defyx/yescrypt-platform.c b/src/crypto/defyx/yescrypt-platform.c new file mode 100644 index 00000000..8f6c22ba --- /dev/null +++ b/src/crypto/defyx/yescrypt-platform.c @@ -0,0 +1,188 @@ +/*- + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include + +#include "yescrypt.h" + +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, * aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; +/* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. + */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? 
size : 0; + return aligned; +} + +static inline void +init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, + const uint8_t * param, size_t paramlen, + uint64_t N, uint32_t r, uint32_t p, + yescrypt_init_shared_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_shared_t half1, half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared->aligned || !shared->aligned_size) + return -1; + } else { + init_region(shared); + } + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + if (yescrypt_kdf(NULL, shared, + param, paramlen, NULL, 0, N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.aligned_size /= 2; + half2.aligned += half1.aligned_size; + half2.aligned_size = half1.aligned_size; + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2, + param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, + YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(shared); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + 
init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} diff --git a/src/crypto/defyx/yescrypt-simd.c b/src/crypto/defyx/yescrypt-simd.c new file mode 100644 index 00000000..884b2076 --- /dev/null +++ b/src/crypto/defyx/yescrypt-simd.c @@ -0,0 +1,1367 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +/* + * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding + * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. 
AVX + * and XOP are of further help either way. + */ + +#include +#ifdef __XOP__ +#include +#endif + +#include +#include +#include +#include + +#include "sha256.h" +#include "sysendian.h" + +#include "yescrypt.h" + +#include "yescrypt-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#ifdef __GNUC__ +#define unlikely(exp) __builtin_expect(exp, 0) +#else +#define unlikely(exp) (exp) +#endif + +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) \ + { \ + __m128i T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ + } +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ +\ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the Salsa20/2 core to the block provided in (X0 ... X3). + */ +#define SALSA20_2(out) \ + { \ + __m128i Y0 = X0; \ + __m128i Y1 = X1; \ + __m128i Y2 = X2; \ + __m128i Y3 = X3; \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } + +/** + * Apply the Salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). 
+ */ +#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ + X0 = _mm_xor_si128(X0, Z0); \ + X1 = _mm_xor_si128(X1, Z1); \ + X2 = _mm_xor_si128(X2, Z2); \ + X3 = _mm_xor_si128(X3, Z3); \ + { \ + maybe_decl Y0 = X0; \ + maybe_decl Y1 = X1; \ + maybe_decl Y2 = X2; \ + maybe_decl Y3 = X3; \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } + +#define SALSA20_8_XOR_MEM(in, out) \ + SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) + +#define SALSA20_8_XOR_REG(out) \ + SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) + +typedef union { + uint32_t w[16]; + __m128i q[4]; +} salsa20_blk_t; + +/** + * blockmix_salsa8(Bin, Bout, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. + */ +static void +blockmix_salsa8(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r) +{ + size_t i; + __m128i X0, X1, X2, X3; + + r--; + PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i * 2], _MM_HINT_T0) + PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) + } + PREFETCH(&Bin[r * 2], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + X0 = Bin[r * 2 + 1].q[0]; + X1 = Bin[r * 2 + 1].q[1]; + X2 = Bin[r * 2 + 1].q[2]; + X3 = Bin[r * 2 + 1].q[3]; + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i <= r; i++) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) + } +} + +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). + */ +#ifdef __AVX__ +#define HI32(X) \ + _mm_srli_si128((X), 4) +#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) +/* Intel's name, also supported by recent gcc */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64x(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(__SSE4_1__) +/* 32-bit */ +#include +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +/* This is tunable */ +#define Swidth 8 + +/* Not tunable in this implementation, 
hard-coded in a few places */ +#define PWXsimple 2 +#define PWXgather 4 + +/* Derived values. Not tunable except via Swidth above. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) +#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) + +#if !defined(__x86_64__) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_X_T __m128i +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ + s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#else +/* 64-bit, or 32-bit without SSE4.1 */ +#define PWXFORM_X_T uint64_t +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = EXTRACT64(X) & Smask2; \ + s0 = *(__m128i *)(S0 + (uint32_t)x); \ + s1 = *(__m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#endif + +#define PWXFORM_WRITE \ + *(__m128i *)(S2 + w) = X0; \ + *(__m128i *)(S2 + w + 16) = X1; \ + *(__m128i *)(S2 + w + 32) = X2; \ + *(__m128i *)(S2 + w + 48) = X3; \ + w += 64; + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0, x0, s00, s01) \ + PWXFORM_SIMD(X1, x1, s10, s11) \ + PWXFORM_SIMD(X2, x2, s20, s21) \ + PWXFORM_SIMD(X3, x3, s30, s31) + +#define PWXFORM \ + { \ + PWXFORM_X_T x0, x1, x2, x3; \ + __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ + PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND PWXFORM_WRITE \ + PWXFORM_ROUND \ + w &= Smask; \ + { \ + uint8_t * Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } \ + } + +#define XOR4(in) \ + X0 = _mm_xor_si128(X0, (in)[0]); \ + X1 = _mm_xor_si128(X1, (in)[1]); \ + X2 = _mm_xor_si128(X2, (in)[2]); \ + X3 = _mm_xor_si128(X3, (in)[3]); + +#define OUT(out) \ + (out)[0] = X0; \ + 
(out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + +typedef struct { + uint8_t *S0, *S1, *S2; + size_t w; +} pwxform_ctx_t; + +#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void +blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + __m128i X0, X1, X2, X3; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + PREFETCH(&Bin[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i], _MM_HINT_T0) + } + + /* 2: X <-- B'_{r_1 - 1} */ + X0 = Bin[r].q[0]; + X1 = Bin[r].q[1]; + X2 = Bin[r].q[2]; + X3 = Bin[r].q[3]; + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + do { + /* 5: X <-- X \xor B'_i */ + XOR4(Bin[i].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + i++; + } while (1); + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) +} + +#define XOR4_2(in1, in2) \ + X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ + X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ + X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ + X3 = _mm_xor_si128((in1)[3], (in2)[3]); + +static uint32_t +blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r) +{ + size_t i; + __m128i X0, X1, X2, X3; + + r--; + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + } + 
PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i <= r; i++) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q) + SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) + } + + return _mm_cvtsi128_si32(X0); +} + +static uint32_t +blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + __m128i X0, X1, X2, X3; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r], _MM_HINT_NTA) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_NTA) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + } else { + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + } + + /* 2: X <-- B'_{r_1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + r--; + do { + /* 5: X <-- X \xor B'_i */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + /* 5: X <-- X \xor B'_i */ + XOR4(Bin1[i + 1].q) + XOR4(Bin2[i + 1].q) + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i + 1].q) + + i += 2; + } while (1); + i++; + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: 
B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef XOR4 +#define XOR4(in, out) \ + (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ + (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ + (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ + (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); + +#define XOR4_Y \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +static uint32_t +blockmix_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; + size_t w = ctx->w; + size_t i; + + /* Convert 128-byte blocks to 64-byte blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r *= 2; + + r--; + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + } + + /* 2: X <-- B'_{r_1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* 3: for i = 0 to r_1 - 1 do */ + i = 0; + r--; + do { + XOR4(Bin1[i].q, Bin2[i].q) + /* 5: X <-- X \xor B'_i */ + XOR4_Y + /* 7: X <-- pwxform(X) */ + PWXFORM + /* 8: B'_i <-- X */ + OUT(Bout[i].q) + + XOR4(Bin1[i + 1].q, Bin2[i + 1].q) + /* 5: X <-- X \xor B'_i */ + XOR4_Y + /* 7: X <-- pwxform(X) */ + PWXFORM + + if (unlikely(i >= r)) + break; + + /* 8: B'_i <-- X */ + OUT(Bout[i + 1].q) + + i += 2; + } while (1); + i++; + + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; + + /* 11: B_i <-- H(B_i) */ + SALSA20_2(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef ARX +#undef SALSA20_2ROUNDS +#undef SALSA20_2 +#undef SALSA20_8_XOR_ANY +#undef SALSA20_8_XOR_MEM +#undef SALSA20_8_XOR_REG +#undef PWXFORM_X_T +#undef PWXFORM_SIMD +#undef PWXFORM_ROUND +#undef PWXFORM +#undef OUT +#undef XOR4 +#undef XOR4_2 +#undef XOR4_Y + +/** + * integerify(B, 
r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t +integerify(const salsa20_blk_t * B, size_t r) +{ + return B[2 * r - 1].w[0]; +} + +/** + * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r bytes in length. The value N must be even and no + * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and + * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 + * bytes as well saves cache lines, but might result in cache bank conflicts). + */ +static void +smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, + salsa20_blk_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 2 * r; + salsa20_blk_t * X = V, * Y; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + if (VROM) { + uint32_t n; + salsa20_blk_t * V_n; + const salsa20_blk_t * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, ctx); + + X = &V[2 * s]; + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + j = blockmix_xor(Y, V_j, X, r, 1, ctx); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V_n[i * s]; + + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 1, ctx); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 1, ctx); + } else if (flags & YESCRYPT_RW) { + uint32_t n; + salsa20_blk_t * V_n, * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, ctx); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[2 * s]; + blockmix(Y, X, r, ctx); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + Y = &V_n[i * s]; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 0, ctx); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, ctx); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 0, ctx); + } else { + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < N - 1; i += 2) { + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix_salsa8(X, Y, r); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[(i + 1) * s]; + blockmix_salsa8(Y, X, r); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix_salsa8(X, Y, r); + + /* 4: X <-- H(X) */ + X = XY; + blockmix_salsa8(Y, X, r); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. The value N must be a power of 2 + * greater than 1. The value Nloop must be even. 
The array V must be aligned + * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 + * bytes (aligning them to 64 bytes as well saves cache lines, but might result + * in cache bank conflicts). + */ +static void +smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, + yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, + const salsa20_blk_t * VROM, salsa20_blk_t * XY, pwxform_ctx_t * ctx) +{ + size_t s = 2 * r; + salsa20_blk_t * X = XY, * Y = &XY[s]; + uint64_t i; + uint32_t j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + i = Nloop / 2; + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + +/* + * Normally, VROM implies YESCRYPT_RW, but we check for these separately + * because our SMix resets YESCRYPT_RW for the smix2() calls operating on the + * entire V when p > 1. + */ + if (VROM && (flags & YESCRYPT_RW)) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + salsa20_blk_t * V_j = &V[j * s]; + const salsa20_blk_t * VROM_j; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(X, V_j, Y, r, ctx) & (NROM - 1); + VROM_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, VROM_j, X, r, 1, ctx) & (N - 1); + V_j = &V[j * s]; + } + } else if (VROM) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 1, ctx) & (N - 1); + V_j = &V[j * s]; + } + } else if (flags & YESCRYPT_RW) { 
+ /* 6: for i = 0 to N - 1 do */ + do { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(X, V_j, Y, r, ctx) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(Y, V_j, X, r, ctx) & (N - 1); + } while (--i); + } else if (ctx) { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 0, ctx) & (N - 1); + } while (--i); + } else { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1); + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1); + } while (--i); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage XY + * must be 256r or 256rp bytes in length (the larger size is required with + * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. 
+ * The array V must be aligned to a multiple of 64 bytes, and arrays B and + * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well + * saves cache lines and helps avoid false sharing in OpenMP-enabled builds + * when p > 1, but it might also result in cache bank conflicts). + */ +static void +smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, + salsa20_blk_t * XY, uint8_t * S, uint8_t * passwd) +{ + size_t s = 2 * r; + uint32_t Nchunk; + uint64_t Nloop_all, Nloop_rw; + uint32_t i; + + /* 1: n <-- N / p */ + Nchunk = N / p; + + /* 2: Nloop_all <-- fNloop(n, t, flags) */ + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + /* 6: Nloop_rw <-- 0 */ + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) { + Nloop_rw = Nloop_all; + } else { + /* 3: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + /* 4: Nloop_rw <-- Nloop_all / p */ + Nloop_rw = Nloop_all / p; + } + } + + /* 8: n <-- n - (n mod 2) */ + Nchunk &= ~(uint32_t)1; /* round down to even */ + /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ + Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ + + /* 11: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + /* 12: u <-- in */ + uint32_t Vchunk = i * Nchunk; + /* 13: if i = p - 1 */ + /* 14: n <-- N - u */ + /* 15: end if */ + /* 16: v <-- u + n - 1 */ + 
uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint8_t * Bp = &B[128 * r * i]; + salsa20_blk_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + /* 17: if YESCRYPT_RW flag is set */ + if (flags & YESCRYPT_RW) { + uint8_t *Si = S + i * Salloc; + /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ + smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, + (salsa20_blk_t *)Si, 0, NULL, XYp, NULL); + ctx_i = (pwxform_ctx_t *)(Si + Sbytes); + /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ + ctx_i->S2 = Si; + /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ + ctx_i->S1 = Si + Sbytes / 3; + /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ + ctx_i->S0 = Si + Sbytes / 3 * 2; + /* 22: w_i <-- 0 */ + ctx_i->w = 0; + /* 23: if i = 0 */ + if (i == 0) { + /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, Bp + (128 * r - 64), 64); + HMAC_SHA256_Update_Y(&ctx, passwd, 32); + HMAC_SHA256_Final_Y(passwd, &ctx); + } + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ + smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); + } + /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, VROM, XYp, ctx_i); + } + + /* 30: for i = 0 to p - 1 do */ + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint8_t * Bp = &B[128 * r * i]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + pwxform_ctx_t * ctx_i = NULL; + if (flags & YESCRYPT_RW) { + uint8_t *Si = S + i * Salloc; + ctx_i = (pwxform_ctx_t *)(Si + Sbytes); + } + /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * 
yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. (This optimized implementation currently additionally + * limits N to the range from 8 to 2^31, but other implementation might not.) + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +static int +yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + const salsa20_blk_t * VROM; + size_t B_size, V_size, XY_size, need; + uint8_t * B, * S; + salsa20_blk_t * V, * XY; + uint8_t sha256[32]; + uint8_t dk[sizeof(sha256)], * dkp = buf; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (N > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 3) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((r > SIZE_MAX / 256 / p) || + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + 
if (flags & YESCRYPT_RW) { + if (N / p <= 3) { + errno = EINVAL; + return -1; + } + if (p > SIZE_MAX / Salloc) { + errno = ENOMEM; + return -1; + } + } +#ifdef _OPENMP + else if (N > SIZE_MAX / 128 / (r * p)) { + errno = ENOMEM; + return -1; + } +#endif + + NROM = 0; + VROM = NULL; + if (shared) { + NROM = shared->aligned_size / ((size_t)128 * r); + if (NROM > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + VROM = shared->aligned; + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_RW)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (salsa20_blk_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_RW) { + size_t S_size = (size_t)Salloc * p; + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint8_t *)tmp.aligned; + XY = (salsa20_blk_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_RW) + S = (uint8_t *)XY + XY_size; + + if (flags) { + HMAC_SHA256_CTX_Y ctx; + 
HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + (flags & __YESCRYPT_PREHASH) ? 16 : 8); + HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); + HMAC_SHA256_Final_Y(sha256, &ctx); + passwd = sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); + + if (t || flags) + memcpy(sha256, B, sizeof(sha256)); + + if (p == 1 || (flags & YESCRYPT_RW)) { + smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, sha256); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, + &V[(size_t)2 * r * i * N], + NROM, VROM, + &XY[(size_t)4 * r * i], NULL, NULL); +#else + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, + NROM, VROM, XY, NULL, NULL); +#endif + } + } + + dkp = buf; + if (flags && buflen < sizeof(dk)) { + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, dk, sizeof(dk)); + dkp = dk; + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if (flags && !(flags & __YESCRYPT_PREHASH)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); + HMAC_SHA256_Final_Y(sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + size_t clen = buflen; + if (clen > sizeof(dk)) + clen = sizeof(dk); + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, sha256, sizeof(sha256)); + SHA256_Final_Y(dk, &ctx); + memcpy(buf, dk, clen); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt or its revision as requested by the parameters. The inputs + * to this function are the same as those for yescrypt_kdf_body() above, with + * the addition of g, which controls hash upgrades (0 for no upgrades so far). + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, + yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + uint8_t dk[32]; + + if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && + p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, + dk, sizeof(dk)); + if (retval) + return retval; + passwd = dk; + passwdlen = sizeof(dk); + } + + do { + uint8_t * dkp = g ? dk : buf; + size_t dklen = g ? 
sizeof(dk) : buflen; + int retval = yescrypt_kdf_body(shared, local, + passwd, passwdlen, salt, saltlen, + N, r, p, t, flags, dkp, dklen); + if (retval) + return retval; + + passwd = dkp; + passwdlen = dklen; + + N <<= 2; + if (!N) + return -1; + t >>= 1; + } while (g--); + + return 0; +} diff --git a/src/crypto/defyx/yescrypt.h b/src/crypto/defyx/yescrypt.h new file mode 100644 index 00000000..4af307e8 --- /dev/null +++ b/src/crypto/defyx/yescrypt.h @@ -0,0 +1,326 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2015 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. 
+ */ +typedef enum { +/* public */ + YESCRYPT_WORM = 2, + YESCRYPT_RW = 1, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000, + __YESCRYPT_PREHASH = 0x100000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_WORM | YESCRYPT_RW | \ + __YESCRYPT_INIT_SHARED | __YESCRYPT_PREHASH) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). + * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with shared->aligned + * being the start address of the ROM and shared->aligned_size being its size + * (which must be consistent with N, r, and p). This may be used e.g. when the + * ROM is to be placed in a SysV shared memory segment allocated by the caller. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. 
+ */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, g, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. g controls hash upgrades (0 for no + * upgrades so far). shared and flags may request special modes as described + * below. local is the thread-local data structure, allowing to preserve and + * reuse a memory allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. 
+ * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0, flags = 0, and shared = NULL. + * In this mode, the thread-local memory region (RAM) is first sequentially + * written to and then randomly read from. This algorithm is friendly towards + * time-memory tradeoffs (TMTO), available both to defenders (albeit not in + * this implementation) and to attackers. + * + * Setting YESCRYPT_WORM enables only minimal enhancements relative to classic + * scrypt: support for the t parameter, and pre- and post-hashing. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. 
Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_RW also moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. 
+ * + * As a side effect of differences between the algorithms, setting YESCRYPT_RW + * also changes the way the total processing time (combined for all threads) + * and memory allocation (if the parallelism is being made use of) is to be + * controlled from N*r*p (for classic scrypt) to N*r (in this modification). + * Obviously, these only differ for p > 1. + * + * Passing a shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->aligned and + * shared->aligned_size fields may be set by the caller of yescrypt_kdf() + * manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, uint32_t __g, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If shared is not NULL, a ROM + * is used and YESCRYPT_RW is required. Otherwise, whether to compute classic + * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or + * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined + * by the setting string. shared (if not NULL) and local must be initialized + * as described above for yescrypt_kdf(). buf must be large enough (as + * indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. 
+ * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to compute classic + * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or + * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined + * by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). 
This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +#endif /* !_YESCRYPT_H_ */ diff --git a/src/crypto/rx/RxAlgo.cpp b/src/crypto/rx/RxAlgo.cpp index b0e92e6e..3b7766ae 100644 --- a/src/crypto/rx/RxAlgo.cpp +++ b/src/crypto/rx/RxAlgo.cpp @@ -26,6 +26,7 @@ #include "crypto/randomx/randomx.h" +#include "crypto/defyx/defyx.h" #include "crypto/rx/RxAlgo.h" @@ -40,6 +41,10 @@ xmrig::Algorithm::Id xmrig::RxAlgo::apply(Algorithm::Id algorithm) randomx_apply_config(RandomX_LokiConfig); break; + case Algorithm::DEFYX: + randomx_apply_config(RandomX_ScalaConfig); + break; + default: randomx_apply_config(RandomX_MoneroConfig); break; @@ -61,6 +66,9 @@ size_t xmrig::RxAlgo::l3(Algorithm::Id algorithm) case Algorithm::RX_LOKI: return RandomX_LokiConfig.ScratchpadL3_Size; + case Algorithm::DEFYX: + return RandomX_ScalaConfig.ScratchpadL3_Size; + default: break; }