From 176f0edef3a5a3f1363d69a1addaf00806eaf6e7 Mon Sep 17 00:00:00 2001 From: MoneroOcean Date: Thu, 16 Jul 2020 17:03:34 -0700 Subject: [PATCH] Fixed bug --- src/crypto/randomx/defyx/insecure_memzero.h | 1 + src/crypto/randomx/defyx/sha256.h | 107 +- src/crypto/randomx/defyx/sysendian.h | 76 +- src/crypto/randomx/defyx/yescrypt-best.c | 4 +- src/crypto/randomx/defyx/yescrypt-common.c | 703 ++++++++++ src/crypto/randomx/defyx/yescrypt-neon.c | 1326 +++++++++++++++++++ src/crypto/randomx/defyx/yescrypt-opt.c | 33 +- src/crypto/randomx/defyx/yescrypt-simd.c | 33 +- 8 files changed, 2170 insertions(+), 113 deletions(-) create mode 100644 src/crypto/randomx/defyx/insecure_memzero.h create mode 100644 src/crypto/randomx/defyx/yescrypt-common.c create mode 100644 src/crypto/randomx/defyx/yescrypt-neon.c diff --git a/src/crypto/randomx/defyx/insecure_memzero.h b/src/crypto/randomx/defyx/insecure_memzero.h new file mode 100644 index 00000000..5a0ba75c --- /dev/null +++ b/src/crypto/randomx/defyx/insecure_memzero.h @@ -0,0 +1 @@ +#define insecure_memzero(buf, len) /* empty */ diff --git a/src/crypto/randomx/defyx/sha256.h b/src/crypto/randomx/defyx/sha256.h index f935cfaa..6210502f 100644 --- a/src/crypto/randomx/defyx/sha256.h +++ b/src/crypto/randomx/defyx/sha256.h @@ -1,5 +1,5 @@ /*- - * Copyright 2005,2007,2009 Colin Percival + * Copyright 2005-2016 Colin Percival * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -22,41 +22,108 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ */ #ifndef _SHA256_H_ #define _SHA256_H_ -#include - +#include #include -typedef struct SHA256Context { +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Use #defines in order to avoid namespace collisions with anyone else's + * SHA256 code (e.g., the code in OpenSSL). + */ +#define SHA256_Init libcperciva_SHA256_Init +#define SHA256_Update libcperciva_SHA256_Update +#define SHA256_Final libcperciva_SHA256_Final +#define SHA256_Buf libcperciva_SHA256_Buf +#define SHA256_CTX libcperciva_SHA256_CTX +#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init +#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update +#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final +#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf +#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX + +/* Context structure for SHA256 operations. */ +typedef struct { uint32_t state[8]; - uint32_t count[2]; - unsigned char buf[64]; -} SHA256_CTX_Y; + uint64_t count; + uint8_t buf[64]; +} SHA256_CTX; -typedef struct HMAC_SHA256Context { - SHA256_CTX_Y ictx; - SHA256_CTX_Y octx; -} HMAC_SHA256_CTX_Y; +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void SHA256_Init(SHA256_CTX *); -void SHA256_Init_Y(SHA256_CTX_Y *); -void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); -void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); -void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); -void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); -void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. 
+ */ +void SHA256_Update(SHA256_CTX *, const void *, size_t); + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void SHA256_Final(uint8_t[32], SHA256_CTX *); + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void SHA256_Buf(const void *, size_t, uint8_t[32]); + +/* Context structure for HMAC-SHA256 operations. */ +typedef struct { + SHA256_CTX ictx; + SHA256_CTX octx; +} HMAC_SHA256_CTX; + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. + */ +void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); /** * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). */ -void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, uint64_t, uint8_t *, size_t); +#ifdef __cplusplus +} +#endif + #endif /* !_SHA256_H_ */ diff --git a/src/crypto/randomx/defyx/sysendian.h b/src/crypto/randomx/defyx/sysendian.h index 04951946..52c1fe73 100644 --- a/src/crypto/randomx/defyx/sysendian.h +++ b/src/crypto/randomx/defyx/sysendian.h @@ -1,5 +1,5 @@ /*- - * Copyright 2007-2009 Colin Percival + * Copyright 2007-2014 Colin Percival * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -22,37 +22,31 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. */ + #ifndef _SYSENDIAN_H_ #define _SYSENDIAN_H_ -/* If we don't have be64enc, the we have isn't usable. */ -#if !HAVE_DECL_BE64ENC -#undef HAVE_SYS_ENDIAN_H -#endif - -#ifdef HAVE_SYS_ENDIAN_H - -#include - -#else - #include +/* Avoid namespace collisions with BSD . 
*/ +#define be32dec libcperciva_be32dec +#define be32enc libcperciva_be32enc +#define be64enc libcperciva_be64enc +#define le32dec libcperciva_le32dec +#define le32enc libcperciva_le32enc + static inline uint32_t -be32dec(const void *pp) +be32dec(const void * pp) { - const uint8_t *p = (uint8_t const *)pp; + const uint8_t * p = (uint8_t const *)pp; return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); } static inline void -be32enc(void *pp, uint32_t x) +be32enc(void * pp, uint32_t x) { uint8_t * p = (uint8_t *)pp; @@ -62,19 +56,8 @@ be32enc(void *pp, uint32_t x) p[0] = (x >> 24) & 0xff; } -static inline uint64_t -be64dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + - ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + - ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + - ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); -} - static inline void -be64enc(void *pp, uint64_t x) +be64enc(void * pp, uint64_t x) { uint8_t * p = (uint8_t *)pp; @@ -89,16 +72,16 @@ be64enc(void *pp, uint64_t x) } static inline uint32_t -le32dec(const void *pp) +le32dec(const void * pp) { - const uint8_t *p = (uint8_t const *)pp; + const uint8_t * p = (uint8_t const *)pp; return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); } static inline void -le32enc(void *pp, uint32_t x) +le32enc(void * pp, uint32_t x) { uint8_t * p = (uint8_t *)pp; @@ -108,31 +91,4 @@ le32enc(void *pp, uint32_t x) p[3] = (x >> 24) & 0xff; } -static inline uint64_t -le64dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + - ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + - ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + - ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); -} - -static inline void -le64enc(void *pp, uint64_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; - p[4] = (x >> 32) & 0xff; - p[5] = (x >> 40) & 0xff; - p[6] = (x >> 48) & 0xff; - p[7] = (x >> 56) & 0xff; -} -#endif /* !HAVE_SYS_ENDIAN_H */ - #endif /* !_SYSENDIAN_H_ */ diff --git a/src/crypto/randomx/defyx/yescrypt-best.c b/src/crypto/randomx/defyx/yescrypt-best.c index 4e836215..b4029fbb 100644 --- a/src/crypto/randomx/defyx/yescrypt-best.c +++ b/src/crypto/randomx/defyx/yescrypt-best.c @@ -1,4 +1,6 @@ -#ifdef __SSE2__ +#ifdef __ARM__ +#include "yescrypt-neon.c" +#elif defined __SSE2__ #include "yescrypt-simd.c" #else #include "yescrypt-opt.c" diff --git a/src/crypto/randomx/defyx/yescrypt-common.c b/src/crypto/randomx/defyx/yescrypt-common.c new file mode 100644 index 00000000..3a0a0870 --- /dev/null +++ b/src/crypto/randomx/defyx/yescrypt-common.c @@ -0,0 +1,703 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include "insecure_memzero.h" +#include "sha256.h" + +#define YESCRYPT_INTERNAL +#include "yescrypt.h" + +#define BYTES2CHARS(bytes) ((((bytes) * 8) + 5) / 6) + +#define HASH_SIZE sizeof(yescrypt_binary_t) /* bytes */ +#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ + +/* + * "$y$", up to 8 params of up to 6 chars each, '$', salt + * Alternatively, but that's smaller: + * "$7$", 3 params encoded as 1+5+5 chars, salt + */ +#define PREFIX_LEN (3 + 8 * 6 + 1 + BYTES2CHARS(32)) + +static const char * const itoa64 = + "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +static const uint8_t atoi64_partial[77] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 64, 64, 64, 64, 64, 64, 64, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 64, 64, 64, 64, 64, 64, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +}; + +static uint8_t *encode64_uint32(uint8_t *dst, size_t dstlen, + uint32_t src, uint32_t min) +{ + uint32_t start = 0, end = 47, chars = 1, bits = 0; + + if (src < min) + return NULL; + src -= min; + + do { + uint32_t count = (end + 1 - start) << bits; + if (src < count) + break; + if (start >= 63) + return NULL; + start = end + 1; + end = start + (62 - end) / 2; + src -= count; + chars++; + bits += 6; + } while (1); + + if (dstlen <= chars) /* require room for a NUL terminator */ + return NULL; + + *dst++ = itoa64[start + (src >> bits)]; + + while (--chars) { + bits -= 6; + *dst++ = itoa64[(src >> bits) & 0x3f]; + } + + *dst = 0; /* NUL terminate just in case */ + + return dst; +} + +static inline uint32_t atoi64(uint8_t src) +{ + if (src >= '.' 
&& src <= 'z') + return atoi64_partial[src - '.']; + + return 64; +} + +static const uint8_t *decode64_uint32(uint32_t *dst, + const uint8_t *src, uint32_t min) +{ + uint32_t start = 0, end = 47, chars = 1, bits = 0; + uint32_t c; + + c = atoi64(*src++); + if (c > 63) + goto fail; + + *dst = min; + while (c > end) { + *dst += (end + 1 - start) << bits; + start = end + 1; + end = start + (62 - end) / 2; + chars++; + bits += 6; + } + + *dst += (c - start) << bits; + + while (--chars) { + c = atoi64(*src++); + if (c > 63) + goto fail; + bits -= 6; + *dst += c << bits; + } + + return src; + +fail: + *dst = 0; + return NULL; +} + +static uint8_t *encode64_uint32_fixed(uint8_t *dst, size_t dstlen, + uint32_t src, uint32_t srcbits) +{ + uint32_t bits; + + for (bits = 0; bits < srcbits; bits += 6) { + if (dstlen < 2) + return NULL; + *dst++ = itoa64[src & 0x3f]; + dstlen--; + src >>= 6; + } + + if (src || dstlen < 1) + return NULL; + + *dst = 0; /* NUL terminate just in case */ + + return dst; +} + +static uint8_t *encode64(uint8_t *dst, size_t dstlen, + const uint8_t *src, size_t srclen) +{ + size_t i; + + for (i = 0; i < srclen; ) { + uint8_t *dnext; + uint32_t value = 0, bits = 0; + do { + value |= (uint32_t)src[i++] << bits; + bits += 8; + } while (bits < 24 && i < srclen); + dnext = encode64_uint32_fixed(dst, dstlen, value, bits); + if (!dnext) + return NULL; + dstlen -= dnext - dst; + dst = dnext; + } + + if (dstlen < 1) + return NULL; + + *dst = 0; /* NUL terminate just in case */ + + return dst; +} + +static const uint8_t *decode64_uint32_fixed(uint32_t *dst, uint32_t dstbits, + const uint8_t *src) +{ + uint32_t bits; + + *dst = 0; + for (bits = 0; bits < dstbits; bits += 6) { + uint32_t c = atoi64(*src++); + if (c > 63) { + *dst = 0; + return NULL; + } + *dst |= c << bits; + } + + return src; +} + +static const uint8_t *decode64(uint8_t *dst, size_t *dstlen, + const uint8_t *src, size_t srclen) +{ + size_t dstpos = 0; + + while (dstpos <= *dstlen && srclen) { + uint32_t value = 0, bits = 0; + while (srclen--) { + uint32_t c = atoi64(*src); + if (c > 63) { + srclen = 0; + break; + } + src++; + value |= c << bits; + bits += 6; + if (bits >= 24) + break; + } + if (!bits) + break; + if (bits < 12) /* must have at least one full byte */ + goto fail; + while (dstpos++ < *dstlen) { + *dst++ = value; + value >>= 8; + bits -= 8; + if (bits < 8) { /* 2 or 4 */ + if (value) /* must be 0 */ + goto fail; + bits = 0; + break; + } + } + if (bits) + goto fail; + } + + if (!srclen && dstpos <= *dstlen) { + *dstlen = dstpos; + return src; + } + +fail: + *dstlen = 0; + return NULL; +} + +typedef enum { ENC = 1, DEC = -1 } encrypt_dir_t; + +static void memxor(unsigned char *dst, unsigned char *src, size_t size) +{ + while (size--) + *dst++ ^= *src++; +} + +static void encrypt(unsigned char *data, size_t datalen, + const yescrypt_binary_t *key, encrypt_dir_t dir) +{ + SHA256_CTX ctx; + unsigned char f[32 + 4]; + size_t halflen, which; + unsigned char mask, round, target; + + if (!datalen) + return; + if (datalen > 64) + datalen = 64; + + halflen = datalen >> 1; + + which = 0; /* offset to half we are working on (0 or halflen) */ + mask = 0x0f; /* current half's extra nibble mask if datalen is odd */ + + round = 0; + target = 5; /* 6 rounds due to Jacques Patarin's CRYPTO 2004 paper */ + + if (dir == DEC) { + which = halflen; /* even round count, so swap the halves */ + mask ^= 0xff; + + round = target; + target = 0; + } + + f[32] = 0; + f[33] = sizeof(*key); + f[34] = datalen; + + do { + SHA256_Init(&ctx); + 
f[35] = round; + SHA256_Update(&ctx, &f[32], 4); + SHA256_Update(&ctx, key, sizeof(*key)); + SHA256_Update(&ctx, &data[which], halflen); + if (datalen & 1) { + f[0] = data[datalen - 1] & mask; + SHA256_Update(&ctx, f, 1); + } + SHA256_Final(f, &ctx); + which ^= halflen; + memxor(&data[which], f, halflen); + if (datalen & 1) { + mask ^= 0xff; + data[datalen - 1] ^= f[halflen] & mask; + } + if (round == target) + break; + round += dir; + } while (1); + + /* ctx is presumably zeroized by SHA256_Final() */ + insecure_memzero(f, sizeof(f)); +} + +uint8_t *yescrypt_r(const yescrypt_shared_t *shared, yescrypt_local_t *local, + const uint8_t *passwd, size_t passwdlen, + const uint8_t *setting, + const yescrypt_binary_t *key, + uint8_t *buf, size_t buflen) +{ + unsigned char saltbin[64], hashbin[32]; + const uint8_t *src, *saltstr, *salt; + uint8_t *dst; + size_t need, prefixlen, saltstrlen, saltlen; + yescrypt_params_t params = { .p = 1 }; + + if (setting[0] != '$' || + (setting[1] != '7' && setting[1] != 'y') || + setting[2] != '$') + return NULL; + src = setting + 3; + + if (setting[1] == '7') { + uint32_t N_log2 = atoi64(*src++); + if (N_log2 < 1 || N_log2 > 63) + return NULL; + params.N = (uint64_t)1 << N_log2; + + src = decode64_uint32_fixed(¶ms.r, 30, src); + if (!src) + return NULL; + + src = decode64_uint32_fixed(¶ms.p, 30, src); + if (!src) + return NULL; + + if (key) + return NULL; + } else { + uint32_t flavor, N_log2; + + src = decode64_uint32(&flavor, src, 0); + if (!src) + return NULL; + + if (flavor < YESCRYPT_RW) { + params.flags = flavor; + } else if (flavor <= YESCRYPT_RW + (YESCRYPT_RW_FLAVOR_MASK >> 2)) { + params.flags = YESCRYPT_RW + ((flavor - YESCRYPT_RW) << 2); + } else { + return NULL; + } + + src = decode64_uint32(&N_log2, src, 1); + if (!src || N_log2 > 63) + return NULL; + params.N = (uint64_t)1 << N_log2; + + src = decode64_uint32(¶ms.r, src, 1); + if (!src) + return NULL; + + if (*src != '$') { + uint32_t have; + + src = decode64_uint32(&have, src, 1); + if (!src) + return NULL; + + if (have & 1) { + src = decode64_uint32(¶ms.p, src, 2); + if (!src) + return NULL; + } + + if (have & 2) { + src = decode64_uint32(¶ms.t, src, 1); + if (!src) + return NULL; + } + + if (have & 4) { + src = decode64_uint32(¶ms.g, src, 1); + if (!src) + return NULL; + } + + if (have & 8) { + uint32_t NROM_log2; + src = decode64_uint32(&NROM_log2, src, 1); + if (!src || NROM_log2 > 63) + return NULL; + params.NROM = (uint64_t)1 << NROM_log2; + } + } + + if (*src++ != '$') + return NULL; + } + + prefixlen = src - setting; + + saltstr = src; + src = (uint8_t *)strrchr((char *)saltstr, '$'); + if (src) + saltstrlen = src - saltstr; + else + saltstrlen = strlen((char *)saltstr); + + if (setting[1] == '7') { + salt = saltstr; + saltlen = saltstrlen; + } else { + const uint8_t *saltend; + + saltlen = sizeof(saltbin); + saltend = decode64(saltbin, &saltlen, saltstr, saltstrlen); + + if (!saltend || (size_t)(saltend - saltstr) != saltstrlen) + goto fail; + + salt = saltbin; + + if (key) + encrypt(saltbin, saltlen, key, ENC); + } + + need = prefixlen + saltstrlen + 1 + HASH_LEN + 1; + if (need > buflen || need < saltstrlen) + goto fail; + + if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + ¶ms, hashbin, sizeof(hashbin))) + goto fail; + + if (key) { + insecure_memzero(saltbin, sizeof(saltbin)); + encrypt(hashbin, sizeof(hashbin), key, ENC); + } + + dst = buf; + memcpy(dst, setting, prefixlen + saltstrlen); + dst += prefixlen + saltstrlen; + *dst++ = '$'; + + dst = encode64(dst, buflen 
- (dst - buf), hashbin, sizeof(hashbin)); + insecure_memzero(hashbin, sizeof(hashbin)); + if (!dst || dst >= buf + buflen) + return NULL; + + *dst = 0; /* NUL termination */ + + return buf; + +fail: + insecure_memzero(saltbin, sizeof(saltbin)); + insecure_memzero(hashbin, sizeof(hashbin)); + return NULL; +} + +uint8_t *yescrypt(const uint8_t *passwd, const uint8_t *setting) +{ + /* prefix, '$', hash, NUL */ + static uint8_t buf[PREFIX_LEN + 1 + HASH_LEN + 1]; + yescrypt_local_t local; + uint8_t *retval; + + if (yescrypt_init_local(&local)) + return NULL; + retval = yescrypt_r(NULL, &local, + passwd, strlen((char *)passwd), setting, NULL, buf, sizeof(buf)); + if (yescrypt_free_local(&local)) + return NULL; + return retval; +} + +uint8_t *yescrypt_reencrypt(uint8_t *hash, + const yescrypt_binary_t *from_key, + const yescrypt_binary_t *to_key) +{ + uint8_t *retval = NULL, *saltstart, *hashstart; + const uint8_t *hashend; + unsigned char saltbin[64], hashbin[32]; + size_t saltstrlen, saltlen, hashlen; + + if (strncmp((char *)hash, "$y$", 3)) + return NULL; + + saltstart = NULL; + hashstart = (uint8_t *)strrchr((char *)hash, '$'); + if (hashstart) { + if (hashstart > (uint8_t *)hash) { + saltstart = hashstart - 1; + while (*saltstart != '$' && saltstart > hash) + saltstart--; + if (*saltstart == '$') + saltstart++; + } + hashstart++; + } else { + hashstart = hash; + } + saltstrlen = saltstart ? (hashstart - 1 - saltstart) : 0; + if (saltstrlen > BYTES2CHARS(64) || + strlen((char *)hashstart) != HASH_LEN) + return NULL; + + if (saltstrlen) { + const uint8_t *saltend; + saltlen = sizeof(saltbin); + saltend = decode64(saltbin, &saltlen, saltstart, saltstrlen); + if (!saltend || *saltend != '$' || saltlen < 1 || saltlen > 64) + goto out; + + if (from_key) + encrypt(saltbin, saltlen, from_key, ENC); + if (to_key) + encrypt(saltbin, saltlen, to_key, DEC); + } + + hashlen = sizeof(hashbin); + hashend = decode64(hashbin, &hashlen, hashstart, HASH_LEN); + if (!hashend || *hashend || hashlen != sizeof(hashbin)) + goto out; + + if (from_key) + encrypt(hashbin, hashlen, from_key, DEC); + if (to_key) + encrypt(hashbin, hashlen, to_key, ENC); + + if (saltstrlen) { + if (!encode64(saltstart, saltstrlen + 1, saltbin, saltlen)) + goto out; /* can't happen */ + *(saltstart + saltstrlen) = '$'; + } + + if (!encode64(hashstart, HASH_LEN + 1, hashbin, hashlen)) + goto out; /* can't happen */ + + retval = hash; + +out: + insecure_memzero(saltbin, sizeof(saltbin)); + insecure_memzero(hashbin, sizeof(hashbin)); + + return retval; +} + +static uint32_t N2log2(uint64_t N) +{ + uint32_t N_log2; + + if (N < 2) + return 0; + + N_log2 = 2; + while (N >> N_log2 != 0) + N_log2++; + N_log2--; + + if (N >> N_log2 != 1) + return 0; + + return N_log2; +} + +uint8_t *yescrypt_encode_params_r(const yescrypt_params_t *params, + const uint8_t *src, size_t srclen, + uint8_t *buf, size_t buflen) +{ + uint32_t flavor, N_log2, NROM_log2, have; + uint8_t *dst; + + if (srclen > SIZE_MAX / 16) + return NULL; + + if (params->flags < YESCRYPT_RW) { + flavor = params->flags; + } else if ((params->flags & YESCRYPT_MODE_MASK) == YESCRYPT_RW && + params->flags <= (YESCRYPT_RW | YESCRYPT_RW_FLAVOR_MASK)) { + flavor = YESCRYPT_RW + (params->flags >> 2); + } else { + return NULL; + } + + N_log2 = N2log2(params->N); + if (!N_log2) + return NULL; + + NROM_log2 = N2log2(params->NROM); + if (params->NROM && !NROM_log2) + return NULL; + + if ((uint64_t)params->r * (uint64_t)params->p >= (1U << 30)) + return NULL; + + dst = buf; + *dst++ = '$'; + *dst++ = 
'y'; + *dst++ = '$'; + + dst = encode64_uint32(dst, buflen - (dst - buf), flavor, 0); + if (!dst) + return NULL; + + dst = encode64_uint32(dst, buflen - (dst - buf), N_log2, 1); + if (!dst) + return NULL; + + dst = encode64_uint32(dst, buflen - (dst - buf), params->r, 1); + if (!dst) + return NULL; + + have = 0; + if (params->p != 1) + have |= 1; + if (params->t) + have |= 2; + if (params->g) + have |= 4; + if (NROM_log2) + have |= 8; + + if (have) { + dst = encode64_uint32(dst, buflen - (dst - buf), have, 1); + if (!dst) + return NULL; + } + + if (params->p != 1) { + dst = encode64_uint32(dst, buflen - (dst - buf), params->p, 2); + if (!dst) + return NULL; + } + + if (params->t) { + dst = encode64_uint32(dst, buflen - (dst - buf), params->t, 1); + if (!dst) + return NULL; + } + + if (params->g) { + dst = encode64_uint32(dst, buflen - (dst - buf), params->g, 1); + if (!dst) + return NULL; + } + + if (NROM_log2) { + dst = encode64_uint32(dst, buflen - (dst - buf), NROM_log2, 1); + if (!dst) + return NULL; + } + + if (dst >= buf + buflen) + return NULL; + + *dst++ = '$'; + + dst = encode64(dst, buflen - (dst - buf), src, srclen); + if (!dst || dst >= buf + buflen) + return NULL; + + *dst = 0; /* NUL termination */ + + return buf; +} + +uint8_t *yescrypt_encode_params(const yescrypt_params_t *params, + const uint8_t *src, size_t srclen) +{ + /* prefix, NUL */ + static uint8_t buf[PREFIX_LEN + 1]; + return yescrypt_encode_params_r(params, src, srclen, buf, sizeof(buf)); +} + +int crypto_scrypt(const uint8_t *passwd, size_t passwdlen, + const uint8_t *salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, + uint8_t *buf, size_t buflen) +{ + yescrypt_local_t local; + yescrypt_params_t params = { .flags = 0, .N = N, .r = r, .p = p }; + int retval; + + if (yescrypt_init_local(&local)) + return -1; + retval = yescrypt_kdf(NULL, &local, + passwd, passwdlen, salt, saltlen, ¶ms, buf, buflen); + if (yescrypt_free_local(&local)) + return -1; + return retval; +} diff --git a/src/crypto/randomx/defyx/yescrypt-neon.c b/src/crypto/randomx/defyx/yescrypt-neon.c new file mode 100644 index 00000000..ed6ff6e0 --- /dev/null +++ b/src/crypto/randomx/defyx/yescrypt-neon.c @@ -0,0 +1,1326 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +/* + * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding + * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX + * and XOP are of further help either way. + */ +#include + +#include +#include +#include +#include + +#include "insecure_memzero.h" +#include "sha256.h" +#include "sysendian.h" + +#include "yescrypt.h" + +#include "yescrypt-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#define PREFETCH(x, hint) /*_mm_prefetch((const char *)(x), (hint));*/ +#define PREFETCH_OUT(x, hint) /* disabled */ + +#define ARX(out, in1, in2, s) \ + { \ + uint32x4_t T = vaddq_u32(in1, in2); \ + out = veorq_u32(out, vshlq_n_u32(T, s)); \ + out = veorq_u32(out, vshrq_n_u32(T, 32-s)); \ + } + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = vextq_u32(X1, X1, 3); \ + X2 = vextq_u32(X2, X2, 2); \ + X3 = vextq_u32(X3, X3, 1); \ +\ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = vextq_u32(X1, X1, 1); \ + X2 = vextq_u32(X2, X2, 2); \ + X3 = vextq_u32(X3, X3, 3); + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3). + */ +#define SALSA20_8_BASE(maybe_decl, out) \ + { \ + maybe_decl Y0 = X0; \ + maybe_decl Y1 = X1; \ + maybe_decl Y2 = X2; \ + maybe_decl Y3 = X3; \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = vaddq_u32(X0, Y0); \ + (out)[1] = X1 = vaddq_u32(X1, Y1); \ + (out)[2] = X2 = vaddq_u32(X2, Y2); \ + (out)[3] = X3 = vaddq_u32(X3, Y3); \ + } +#define SALSA20_8(out) \ + SALSA20_8_BASE(uint32x4_t, out) + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). + */ +#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ + X0 = veorq_u32(X0, Z0); \ + X1 = veorq_u32(X1, Z1); \ + X2 = veorq_u32(X2, Z2); \ + X3 = veorq_u32(X3, Z3); \ + SALSA20_8_BASE(maybe_decl, out) + +#define SALSA20_8_XOR_MEM(in, out) \ + SALSA20_8_XOR_ANY(uint32x4_t, (in)[0], (in)[1], (in)[2], (in)[3], out) + +#define SALSA20_8_XOR_REG(out) \ + SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) + +typedef union { + uint32_t w[16]; + uint32x4_t q[4]; +} salsa20_blk_t; + +/** + * blockmix_salsa8(Bin, Bout, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. 
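+ * Bin and Bout are restrict-qualified and must not overlap.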
+ */ +static inline void +blockmix_salsa8(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r) +{ + uint32x4_t X0, X1, X2, X3; + size_t i; + + r--; + PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + X0 = Bin[r * 2 + 1].q[0]; + X1 = Bin[r * 2 + 1].q[1]; + X2 = Bin[r * 2 + 1].q[2]; + X3 = Bin[r * 2 + 1].q[3]; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q) +} + +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). Since for many other CPUs using (V)PSHUFD is a + * win in terms of throughput or/and not needing a move instruction, we + * currently use it despite of the higher latency on some older CPUs. As an + * alternative, the #if below may be patched to only enable use of (V)PSHUFD + * when building with SSE4.1 or newer, which is not available on older CPUs + * where this instruction has higher latency. + */ +#define LO32(X) \ + vmovn_u64(vreinterpretq_u64_u32(X)) +#define HI32(X) \ + LO32(vrev64q_u32(X)) + +#define EXTRACT64(X) \ + vgetq_lane_u64(vreinterpretq_u64_u32(X), 0) + +/* This is tunable */ +#define S_BITS 8 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define S_SIMD 2 +#define S_P 4 + +/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable except via S_BITS above. 
*/ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) + +#define PWXFORM_X_T uint64_t +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = EXTRACT64(X) & S_MASK2; \ + s0 = *(const uint32x4_t *)(S0 + (uint32_t)x); \ + s1 = *(const uint32x4_t *)(S1 + (x >> 32)); \ + X = vreinterpretq_u32_u64(vmull_u32(HI32(X), LO32(X))); \ + X = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(X), vreinterpretq_u64_u32(s0))); \ + X = veorq_u32(X, s1); + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0, x0, s00, s01) \ + PWXFORM_SIMD(X1, x1, s10, s11) \ + PWXFORM_SIMD(X2, x2, s20, s21) \ + PWXFORM_SIMD(X3, x3, s30, s31) + +#define PWXFORM \ + { \ + PWXFORM_X_T x0, x1, x2, x3; \ + uint32x4_t s00, s01, s10, s11, s20, s21, s30, s31; \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + } + +#define XOR4(in) \ + X0 = veorq_u32(X0, (in)[0]); \ + X1 = veorq_u32(X1, (in)[1]); \ + X2 = veorq_u32(X2, (in)[2]); \ + X3 = veorq_u32(X3, (in)[3]); + +#define OUT(out) \ + (out)[0] = X0; \ + (out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void +blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, + size_t r, const uint32x4_t *restrict S) +{ + const uint8_t * S0, * S1; + uint32x4_t X0, X1, X2, X3; + size_t i; + + if (!S) { + blockmix_salsa8(Bin, Bout, r); + return; + } + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + + /* X <-- B_{r1 - 1} */ + X0 = Bin[r].q[0]; + X1 = Bin[r].q[1]; + X2 = Bin[r].q[2]; + X3 = Bin[r].q[3]; + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) +} + +#define XOR4_2(in1, in2) \ + X0 = veorq_u32((in1)[0], (in2)[0]); \ + X1 = veorq_u32((in1)[1], (in2)[1]); \ + X2 = veorq_u32((in1)[2], (in2)[2]); \ + X3 = veorq_u32((in1)[3], (in2)[3]); + +static inline uint32_t +blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM) +{ + uint32x4_t X0, X1, X2, X3; + size_t i; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } else { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], 
_MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q) + SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q) + SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) + + return vgetq_lane_u32(X0, 0); +} + +static uint32_t +blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM, const uint32x4_t *restrict S) +{ + const uint8_t * S0, * S1; + uint32x4_t X0, X1, X2, X3; + size_t i; + + if (!S) + return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r], _MM_HINT_NTA) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_NTA) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } else { + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return vgetq_lane_u32(X0, 0); +} + +#undef XOR4 +#define XOR4(in, out) \ + (out)[0] = Y0 = veorq_u32((in)[0], (out)[0]); \ + (out)[1] = Y1 = veorq_u32((in)[1], (out)[1]); \ + (out)[2] = Y2 = veorq_u32((in)[2], (out)[2]); \ + (out)[3] = Y3 = veorq_u32((in)[3], (out)[3]); + +static inline uint32_t +blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r) +{ + uint32x4_t X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + r--; + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + 
XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q, Bin2[0].q) + SALSA20_8_XOR_REG(Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q, Bin2[i * 2].q) + SALSA20_8_XOR_REG(Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) + + return vgetq_lane_u32(X0, 0); +} + +#define XOR4_Y \ + X0 = veorq_u32(X0, Y0); \ + X1 = veorq_u32(X1, Y1); \ + X2 = veorq_u32(X2, Y2); \ + X3 = veorq_u32(X3, Y3); + +static uint32_t +blockmix_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, const uint32x4_t *restrict S) +{ + const uint8_t * S0, * S1; + uint32x4_t X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + if (!S) + return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + XOR4(Bin1[i].q, Bin2[i].q) + /* X <-- H'(X \xor B_i) */ + XOR4_Y + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q, Bin2[i].q) + XOR4_Y + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return vgetq_lane_u32(X0, 0); +} + +#undef ARX +#undef SALSA20_2ROUNDS +#undef SALSA20_8 +#undef SALSA20_8_XOR_ANY +#undef SALSA20_8_XOR_MEM +#undef SALSA20_8_XOR_REG +#undef PWXFORM_SIMD_1 +#undef PWXFORM_SIMD_2 +#undef PWXFORM_ROUND +#undef PWXFORM +#undef OUT +#undef XOR4 +#undef XOR4_2 +#undef XOR4_Y + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t +integerify(const salsa20_blk_t * B, size_t r) +{ + return B[2 * r - 1].w[0]; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r bytes in length. The value N must be even and no + * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and + * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 + * bytes as well saves cache lines, but might result in cache bank conflicts). 
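+ * NROM is 0 when no ROM is attached, in which case the ROM-specific code
+ * path is skipped.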
+ */ +static void +smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = V, * Y; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + if (NROM && (VROM_mask & 1)) { + uint32_t n; + salsa20_blk_t * V_n; + const salsa20_blk_t * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + X = &V[2 * s]; + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + } else { + /* X <-- H(X) */ + blockmix(Y, X, r, S); + j = integerify(X, r); + } + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V_n[i * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((n + i) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 1, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((N - 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + } + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 1, S); + } else if (flags & YESCRYPT_RW) { + uint32_t n; + salsa20_blk_t * V_n, * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[2 * s]; + blockmix(Y, X, r, S); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + Y = &V_n[i * s]; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 0, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 0, S); + } else { + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < N - 1; i += 2) { + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[(i + 1) * s]; + blockmix(Y, X, r, S); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + X = XY; + blockmix(Y, X, r, S); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. The value N must be a power of 2 + * greater than 1. The value Nloop must be even. The array V must be aligned + * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 + * bytes (aligning them to 64 bytes as well saves cache lines, but might result + * in cache bank conflicts). + */ +static void +smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, + yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, + const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = XY, * Y = &XY[s]; + uint64_t i; + uint32_t j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + i = Nloop / 2; + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + +/* + * Normally, NROM implies YESCRYPT_RW, but we check for these separately + * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls + * operating on the entire V. 
+ */ + if (NROM && (flags & YESCRYPT_RW)) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(X, V_j, Y, r, S); + + if (((i + 1) & VROM_mask) == 1) { + const salsa20_blk_t * VROM_j; + + j &= NROM - 1; + VROM_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, VROM_j, X, r, 1, S); + } else { + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(Y, V_j, X, r, S); + } + j &= N - 1; + V_j = &V[j * s]; + } + } else if (NROM) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((i + 1) & VROM_mask) == 1) { + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + j &= N - 1; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + j &= N - 1; + V_j = &V[j * s]; + } + } else if (flags & YESCRYPT_RW) { + /* 6: for i = 0 to N - 1 do */ + do { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(X, V_j, Y, r, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(Y, V_j, X, r, S); + j &= N - 1; + } while (--i); + } else { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 0, S); + j &= N - 1; + } while (--i); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage XY + * must be 256r or 256rp bytes in length (the larger size is required with + * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and + * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well + * saves cache lines and helps avoid false sharing in OpenMP-enabled builds + * when p > 1, but it might also result in cache bank conflicts). 
+ */ +static void +smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + size_t s = 2 * r; + uint32_t Nchunk = N / p; + uint64_t Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint32_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint32_t Vchunk = i * Nchunk; + uint8_t * Bp = &B[128 * r * i]; + salsa20_blk_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 128, + flags & ~YESCRYPT_PWXFORM, + Sp, NROM, shared, XYp, NULL); + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint8_t * Bp = &B[128 * r * i]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. (This optimized implementation currently additionally + * limits N to the range from 8 to 2^31, but other implementation might not.) + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. 
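+ * On parameter errors, errno is set to EINVAL, EFBIG, or ENOMEM.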
+ */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint8_t * B, * S; + salsa20_blk_t * V, * XY; + uint8_t sha256[32]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (N > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { + errno = EINVAL; + return -1; + } + if ((r > SIZE_MAX / 256 / p) || + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX) && + (N > SIZE_MAX / 128 / (r * p))) { + errno = ENOMEM; + return -1; + } +#endif + if ((flags & YESCRYPT_PWXFORM) && +#ifndef _OPENMP + (flags & YESCRYPT_PARALLEL_SMIX) && +#endif + p > SIZE_MAX / S_SIZE_ALL) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (NROM > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (salsa20_blk_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL; +#ifdef _OPENMP + S_size *= p; +#else + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; +#endif + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint8_t *)tmp.aligned; + XY = (salsa20_blk_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint8_t *)XY + XY_size; + + if (t || flags) { + SHA256_CTX ctx; + 
SHA256_Init(&ctx); + SHA256_Update(&ctx, passwd, passwdlen); + SHA256_Final(sha256, &ctx); + passwd = sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); + + if (t || flags) + memcpy(sha256, B, sizeof(sha256)); + + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, + &V[(size_t)2 * r * i * N], + NROM, shared, + &XY[(size_t)4 * r * i], + S ? &S[S_SIZE_ALL * i] : S); +#else + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, + NROM, shared, XY, S); +#endif + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, buf, buflen); +#if 0 +/* Proper yescrypt */ + HMAC_SHA256_Update(&ctx, "Client Key", 10); +#else +/* GlobalBoost-Y buggy yescrypt */ + HMAC_SHA256_Update(&ctx, salt, saltlen); +#endif + HMAC_SHA256_Final(sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX ctx; + SHA256_Init(&ctx); + SHA256_Update(&ctx, sha256, sizeof(sha256)); + SHA256_Final(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} diff --git a/src/crypto/randomx/defyx/yescrypt-opt.c b/src/crypto/randomx/defyx/yescrypt-opt.c index 3da0a532..c621af6e 100644 --- a/src/crypto/randomx/defyx/yescrypt-opt.c +++ b/src/crypto/randomx/defyx/yescrypt-opt.c @@ -33,6 +33,7 @@ #include #include +#include "insecure_memzero.h" #include "sha256.h" #include "sysendian.h" @@ -768,10 +769,10 @@ smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, /* 23: if i = 0 */ if (i == 0) { /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, Bp + (s - 8), 64); - HMAC_SHA256_Update_Y(&ctx, passwd, 32); - HMAC_SHA256_Final_Y(passwd, &ctx); + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, Bp + (s - 8), 64); + HMAC_SHA256_Update(&ctx, passwd, 32); + HMAC_SHA256_Final(passwd, &ctx); } } if (!(flags & __YESCRYPT_INIT_SHARED_2)) { @@ -966,11 +967,11 @@ yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, S = (uint8_t *)XY + XY_size; if (flags) { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, "yescrypt-prehash", (flags & __YESCRYPT_PREHASH) ? 
16 : 8); - HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); - HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + HMAC_SHA256_Update(&ctx, passwd, passwdlen); + HMAC_SHA256_Final((uint8_t *)sha256, &ctx); passwd = (uint8_t *)sha256; passwdlen = sizeof(sha256); } @@ -1026,20 +1027,20 @@ yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, if (flags && !(flags & __YESCRYPT_PREHASH)) { /* Compute ClientKey */ { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); - HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); - HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update(&ctx, "Client Key", 10); + HMAC_SHA256_Final((uint8_t *)sha256, &ctx); } /* Compute StoredKey */ { - SHA256_CTX_Y ctx; + SHA256_CTX ctx; size_t clen = buflen; if (clen > sizeof(dk)) clen = sizeof(dk); - SHA256_Init_Y(&ctx); - SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); - SHA256_Final_Y(dk, &ctx); + SHA256_Init(&ctx); + SHA256_Update(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final(dk, &ctx); memcpy(buf, dk, clen); } } diff --git a/src/crypto/randomx/defyx/yescrypt-simd.c b/src/crypto/randomx/defyx/yescrypt-simd.c index 884b2076..6f3d5ad7 100644 --- a/src/crypto/randomx/defyx/yescrypt-simd.c +++ b/src/crypto/randomx/defyx/yescrypt-simd.c @@ -44,6 +44,7 @@ #include #include +#include "insecure_memzero.h" #include "sha256.h" #include "sysendian.h" @@ -1032,10 +1033,10 @@ smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, /* 23: if i = 0 */ if (i == 0) { /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, Bp + (128 * r - 64), 64); - HMAC_SHA256_Update_Y(&ctx, passwd, 32); - HMAC_SHA256_Final_Y(passwd, &ctx); + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, Bp + (128 * r - 64), 64); + HMAC_SHA256_Update(&ctx, passwd, 32); + HMAC_SHA256_Final(passwd, &ctx); } } if (!(flags & __YESCRYPT_INIT_SHARED_2)) { @@ -1234,11 +1235,11 @@ yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, S = (uint8_t *)XY + XY_size; if (flags) { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, "yescrypt-prehash", (flags & __YESCRYPT_PREHASH) ? 16 : 8); - HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); - HMAC_SHA256_Final_Y(sha256, &ctx); + HMAC_SHA256_Update(&ctx, passwd, passwdlen); + HMAC_SHA256_Final(sha256, &ctx); passwd = sha256; passwdlen = sizeof(sha256); } @@ -1291,20 +1292,20 @@ yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, if (flags && !(flags & __YESCRYPT_PREHASH)) { /* Compute ClientKey */ { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); - HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); - HMAC_SHA256_Final_Y(sha256, &ctx); + HMAC_SHA256_CTX ctx; + HMAC_SHA256_Init(&ctx, dkp, sizeof(dk)); + HMAC_SHA256_Update(&ctx, "Client Key", 10); + HMAC_SHA256_Final(sha256, &ctx); } /* Compute StoredKey */ { - SHA256_CTX_Y ctx; + SHA256_CTX ctx; size_t clen = buflen; if (clen > sizeof(dk)) clen = sizeof(dk); - SHA256_Init_Y(&ctx); - SHA256_Update_Y(&ctx, sha256, sizeof(sha256)); - SHA256_Final_Y(dk, &ctx); + SHA256_Init(&ctx); + SHA256_Update(&ctx, sha256, sizeof(sha256)); + SHA256_Final(dk, &ctx); memcpy(buf, dk, clen); } }
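
The ARX macro in the new yescrypt-neon.c is the Salsa20 add-rotate-xor step expressed on NEON 32-bit lanes: it adds two vectors, then XORs the left-shifted and right-shifted copies of the sum into the destination, which together amount to a rotate. A minimal scalar sketch of the same step, using the illustrative helper names rotl32 and arx_scalar (these are not part of the patch):

#include <stdint.h>

/* Rotate a 32-bit word left by s bits (0 < s < 32). */
static inline uint32_t rotl32(uint32_t x, unsigned s)
{
	return (x << s) | (x >> (32 - s));
}

/* Scalar equivalent of the NEON ARX(out, in1, in2, s) macro:
 * out ^= ROTL32(in1 + in2, s). SALSA20_2ROUNDS applies this to whole
 * 4-lane vectors, using vextq_u32 to rotate lanes between the column
 * and row half-rounds. */
static inline void arx_scalar(uint32_t *out, uint32_t in1, uint32_t in2, unsigned s)
{
	*out ^= rotl32(in1 + in2, s);
}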