diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index 28b578ee..40971777 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -40,12 +40,13 @@ if (WITH_RANDOMX) src/crypto/rx/RxDataset.cpp src/crypto/rx/RxQueue.cpp src/crypto/rx/RxVm.cpp - src/crypto/randomx/defyx/KangarooTwelve.c - src/crypto/randomx/defyx/KeccakP-1600-reference.c - src/crypto/randomx/defyx/KeccakSpongeWidth1600.c - src/crypto/randomx/defyx/yescrypt-best.c - src/crypto/randomx/panthera/sha256.c - src/crypto/randomx/panthera/yespower-opt.c + + ### Removed useless includes + src/crypto/randomx/panthera/sha256.c + src/crypto/randomx/panthera/KangarooTwelve.c + src/crypto/randomx/panthera/KeccakP-1600-reference.c + src/crypto/randomx/panthera/KeccakSpongeWidth1600.c + src/crypto/randomx/panthera/yespower-opt.c ) if (CMAKE_C_COMPILER_ID MATCHES MSVC) diff --git a/src/crypto/randomx/defyx/insecure_memzero.h b/src/crypto/randomx/defyx/insecure_memzero.h deleted file mode 100644 index 5a0ba75c..00000000 --- a/src/crypto/randomx/defyx/insecure_memzero.h +++ /dev/null @@ -1 +0,0 @@ -#define insecure_memzero(buf, len) /* empty */ diff --git a/src/crypto/randomx/defyx/sha256.h b/src/crypto/randomx/defyx/sha256.h deleted file mode 100644 index 6210502f..00000000 --- a/src/crypto/randomx/defyx/sha256.h +++ /dev/null @@ -1,129 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Use #defines in order to avoid namespace collisions with anyone else's - * SHA256 code (e.g., the code in OpenSSL). - */ -#define SHA256_Init libcperciva_SHA256_Init -#define SHA256_Update libcperciva_SHA256_Update -#define SHA256_Final libcperciva_SHA256_Final -#define SHA256_Buf libcperciva_SHA256_Buf -#define SHA256_CTX libcperciva_SHA256_CTX -#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init -#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update -#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final -#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf -#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX - -/* Context structure for SHA256 operations. */ -typedef struct { - uint32_t state[8]; - uint64_t count; - uint8_t buf[64]; -} SHA256_CTX; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void SHA256_Init(SHA256_CTX *); - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -void SHA256_Update(SHA256_CTX *, const void *, size_t); - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void SHA256_Final(uint8_t[32], SHA256_CTX *); - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void SHA256_Buf(const void *, size_t, uint8_t[32]); - -/* Context structure for HMAC-SHA256 operations. */ -typedef struct { - SHA256_CTX ictx; - SHA256_CTX octx; -} HMAC_SHA256_CTX; - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); - -#ifdef __cplusplus -} -#endif - -#endif /* !_SHA256_H_ */ diff --git a/src/crypto/randomx/defyx/sysendian.h b/src/crypto/randomx/defyx/sysendian.h deleted file mode 100644 index 52c1fe73..00000000 --- a/src/crypto/randomx/defyx/sysendian.h +++ /dev/null @@ -1,94 +0,0 @@ -/*- - * Copyright 2007-2014 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SYSENDIAN_H_ -#define _SYSENDIAN_H_ - -#include - -/* Avoid namespace collisions with BSD . */ -#define be32dec libcperciva_be32dec -#define be32enc libcperciva_be32enc -#define be64enc libcperciva_be64enc -#define le32dec libcperciva_le32dec -#define le32enc libcperciva_le32enc - -static inline uint32_t -be32dec(const void * pp) -{ - const uint8_t * p = (uint8_t const *)pp; - - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); -} - -static inline void -be32enc(void * pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; -} - -static inline void -be64enc(void * pp, uint64_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[7] = x & 0xff; - p[6] = (x >> 8) & 0xff; - p[5] = (x >> 16) & 0xff; - p[4] = (x >> 24) & 0xff; - p[3] = (x >> 32) & 0xff; - p[2] = (x >> 40) & 0xff; - p[1] = (x >> 48) & 0xff; - p[0] = (x >> 56) & 0xff; -} - -static inline uint32_t -le32dec(const void * pp) -{ - const uint8_t * p = (uint8_t const *)pp; - - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); -} - -static inline void -le32enc(void * pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} - -#endif /* !_SYSENDIAN_H_ */ diff --git a/src/crypto/randomx/defyx/yescrypt-best.c b/src/crypto/randomx/defyx/yescrypt-best.c deleted file mode 100644 index b4029fbb..00000000 --- a/src/crypto/randomx/defyx/yescrypt-best.c +++ /dev/null @@ -1,7 +0,0 @@ -#ifdef __ARM__ -#include "yescrypt-neon.c" -#elif defined __SSE2__ -#include "yescrypt-simd.c" -#else -#include "yescrypt-opt.c" -#endif diff --git a/src/crypto/randomx/defyx/yescrypt-common.c b/src/crypto/randomx/defyx/yescrypt-common.c deleted file mode 100644 index 3a0a0870..00000000 --- a/src/crypto/randomx/defyx/yescrypt-common.c +++ /dev/null @@ -1,703 +0,0 @@ -/*- - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include - -#include "insecure_memzero.h" -#include "sha256.h" - -#define YESCRYPT_INTERNAL -#include "yescrypt.h" - -#define BYTES2CHARS(bytes) ((((bytes) * 8) + 5) / 6) - -#define HASH_SIZE sizeof(yescrypt_binary_t) /* bytes */ -#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ - -/* - * "$y$", up to 8 params of up to 6 chars each, '$', salt - * Alternatively, but that's smaller: - * "$7$", 3 params encoded as 1+5+5 chars, salt - */ -#define PREFIX_LEN (3 + 8 * 6 + 1 + BYTES2CHARS(32)) - -static const char * const itoa64 = - "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -static const uint8_t atoi64_partial[77] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 64, 64, 64, 64, 64, 64, 64, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 64, 64, 64, 64, 64, 64, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 -}; - -static uint8_t *encode64_uint32(uint8_t *dst, size_t dstlen, - uint32_t src, uint32_t min) -{ - uint32_t start = 0, end = 47, chars = 1, bits = 0; - - if (src < min) - return NULL; - src -= min; - - do { - uint32_t count = (end + 1 - start) << bits; - if (src < count) - break; - if (start >= 63) - return NULL; - start = end + 1; - end = start + (62 - end) / 2; - src -= count; - chars++; - bits += 6; - } while (1); - - if (dstlen <= chars) /* require room for a NUL terminator */ - return NULL; - - *dst++ = itoa64[start + (src >> bits)]; - - while (--chars) { - bits -= 6; - *dst++ = itoa64[(src >> bits) & 0x3f]; - } - - *dst = 0; /* NUL terminate just in case */ - - return dst; -} - -static inline uint32_t atoi64(uint8_t src) -{ - if (src >= '.' && src <= 'z') - return atoi64_partial[src - '.']; - - return 64; -} - -static const uint8_t *decode64_uint32(uint32_t *dst, - const uint8_t *src, uint32_t min) -{ - uint32_t start = 0, end = 47, chars = 1, bits = 0; - uint32_t c; - - c = atoi64(*src++); - if (c > 63) - goto fail; - - *dst = min; - while (c > end) { - *dst += (end + 1 - start) << bits; - start = end + 1; - end = start + (62 - end) / 2; - chars++; - bits += 6; - } - - *dst += (c - start) << bits; - - while (--chars) { - c = atoi64(*src++); - if (c > 63) - goto fail; - bits -= 6; - *dst += c << bits; - } - - return src; - -fail: - *dst = 0; - return NULL; -} - -static uint8_t *encode64_uint32_fixed(uint8_t *dst, size_t dstlen, - uint32_t src, uint32_t srcbits) -{ - uint32_t bits; - - for (bits = 0; bits < srcbits; bits += 6) { - if (dstlen < 2) - return NULL; - *dst++ = itoa64[src & 0x3f]; - dstlen--; - src >>= 6; - } - - if (src || dstlen < 1) - return NULL; - - *dst = 0; /* NUL terminate just in case */ - - return dst; -} - -static uint8_t *encode64(uint8_t *dst, size_t dstlen, - const uint8_t *src, size_t srclen) -{ - size_t i; - - for (i = 0; i < srclen; ) { - uint8_t *dnext; - uint32_t value = 0, bits = 0; - do { - value |= (uint32_t)src[i++] << bits; - bits += 8; - } while (bits < 24 && i < srclen); - dnext = encode64_uint32_fixed(dst, dstlen, value, bits); - if (!dnext) - return NULL; - dstlen -= dnext - dst; - dst = dnext; - } - - if (dstlen < 1) - return NULL; - - *dst = 0; /* NUL terminate just in case */ - - return dst; -} - -static const uint8_t *decode64_uint32_fixed(uint32_t *dst, uint32_t dstbits, - const uint8_t *src) -{ - uint32_t bits; - - *dst = 0; - for (bits = 0; bits < dstbits; bits += 6) { - uint32_t c = atoi64(*src++); - if (c > 63) { - *dst = 0; - return NULL; - } - *dst |= c << bits; - } - - return src; -} - -static const uint8_t *decode64(uint8_t *dst, size_t *dstlen, - const uint8_t *src, size_t srclen) -{ - size_t dstpos = 0; - - while (dstpos <= *dstlen && srclen) { - uint32_t value = 0, bits = 0; - while (srclen--) { - uint32_t c = atoi64(*src); - if (c > 63) { - srclen = 0; - break; - } - src++; - value |= c << bits; - bits += 6; - if (bits >= 24) - break; - } - if (!bits) - break; - if (bits < 12) /* must have at least one full byte */ - goto fail; - while (dstpos++ < *dstlen) { - *dst++ = value; - value >>= 8; - bits -= 8; - if (bits < 8) { /* 2 or 4 */ - if (value) /* must be 0 */ - goto fail; - bits = 0; - break; - } - } - if (bits) - goto fail; - } - - if (!srclen && dstpos <= *dstlen) { - *dstlen = dstpos; - return src; - } - -fail: - *dstlen = 0; - return NULL; -} - -typedef enum { ENC = 1, DEC = -1 } encrypt_dir_t; - -static void memxor(unsigned char *dst, unsigned char *src, size_t size) -{ - while (size--) - *dst++ ^= *src++; -} - -static void encrypt(unsigned char *data, size_t datalen, - const yescrypt_binary_t *key, encrypt_dir_t dir) -{ - SHA256_CTX ctx; - unsigned char f[32 + 4]; - size_t halflen, which; - unsigned char mask, round, target; - - if (!datalen) - return; - if (datalen > 64) - datalen = 64; - - halflen = datalen >> 1; - - which = 0; /* offset to half we are working on (0 or halflen) */ - mask = 0x0f; /* current half's extra nibble mask if datalen is odd */ - - round = 0; - target = 5; /* 6 rounds due to Jacques Patarin's CRYPTO 2004 paper */ - - if (dir == DEC) { - which = halflen; /* even round count, so swap the halves */ - mask ^= 0xff; - - round = target; - target = 0; - } - - f[32] = 0; - f[33] = sizeof(*key); - f[34] = datalen; - - do { - SHA256_Init(&ctx); - f[35] = round; - SHA256_Update(&ctx, &f[32], 4); - SHA256_Update(&ctx, key, sizeof(*key)); - SHA256_Update(&ctx, &data[which], halflen); - if (datalen & 1) { - f[0] = data[datalen - 1] & mask; - SHA256_Update(&ctx, f, 1); - } - SHA256_Final(f, &ctx); - which ^= halflen; - memxor(&data[which], f, halflen); - if (datalen & 1) { - mask ^= 0xff; - data[datalen - 1] ^= f[halflen] & mask; - } - if (round == target) - break; - round += dir; - } while (1); - - /* ctx is presumably zeroized by SHA256_Final() */ - insecure_memzero(f, sizeof(f)); -} - -uint8_t *yescrypt_r(const yescrypt_shared_t *shared, yescrypt_local_t *local, - const uint8_t *passwd, size_t passwdlen, - const uint8_t *setting, - const yescrypt_binary_t *key, - uint8_t *buf, size_t buflen) -{ - unsigned char saltbin[64], hashbin[32]; - const uint8_t *src, *saltstr, *salt; - uint8_t *dst; - size_t need, prefixlen, saltstrlen, saltlen; - yescrypt_params_t params = { .p = 1 }; - - if (setting[0] != '$' || - (setting[1] != '7' && setting[1] != 'y') || - setting[2] != '$') - return NULL; - src = setting + 3; - - if (setting[1] == '7') { - uint32_t N_log2 = atoi64(*src++); - if (N_log2 < 1 || N_log2 > 63) - return NULL; - params.N = (uint64_t)1 << N_log2; - - src = decode64_uint32_fixed(¶ms.r, 30, src); - if (!src) - return NULL; - - src = decode64_uint32_fixed(¶ms.p, 30, src); - if (!src) - return NULL; - - if (key) - return NULL; - } else { - uint32_t flavor, N_log2; - - src = decode64_uint32(&flavor, src, 0); - if (!src) - return NULL; - - if (flavor < YESCRYPT_RW) { - params.flags = flavor; - } else if (flavor <= YESCRYPT_RW + (YESCRYPT_RW_FLAVOR_MASK >> 2)) { - params.flags = YESCRYPT_RW + ((flavor - YESCRYPT_RW) << 2); - } else { - return NULL; - } - - src = decode64_uint32(&N_log2, src, 1); - if (!src || N_log2 > 63) - return NULL; - params.N = (uint64_t)1 << N_log2; - - src = decode64_uint32(¶ms.r, src, 1); - if (!src) - return NULL; - - if (*src != '$') { - uint32_t have; - - src = decode64_uint32(&have, src, 1); - if (!src) - return NULL; - - if (have & 1) { - src = decode64_uint32(¶ms.p, src, 2); - if (!src) - return NULL; - } - - if (have & 2) { - src = decode64_uint32(¶ms.t, src, 1); - if (!src) - return NULL; - } - - if (have & 4) { - src = decode64_uint32(¶ms.g, src, 1); - if (!src) - return NULL; - } - - if (have & 8) { - uint32_t NROM_log2; - src = decode64_uint32(&NROM_log2, src, 1); - if (!src || NROM_log2 > 63) - return NULL; - params.NROM = (uint64_t)1 << NROM_log2; - } - } - - if (*src++ != '$') - return NULL; - } - - prefixlen = src - setting; - - saltstr = src; - src = (uint8_t *)strrchr((char *)saltstr, '$'); - if (src) - saltstrlen = src - saltstr; - else - saltstrlen = strlen((char *)saltstr); - - if (setting[1] == '7') { - salt = saltstr; - saltlen = saltstrlen; - } else { - const uint8_t *saltend; - - saltlen = sizeof(saltbin); - saltend = decode64(saltbin, &saltlen, saltstr, saltstrlen); - - if (!saltend || (size_t)(saltend - saltstr) != saltstrlen) - goto fail; - - salt = saltbin; - - if (key) - encrypt(saltbin, saltlen, key, ENC); - } - - need = prefixlen + saltstrlen + 1 + HASH_LEN + 1; - if (need > buflen || need < saltstrlen) - goto fail; - - if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - ¶ms, hashbin, sizeof(hashbin))) - goto fail; - - if (key) { - insecure_memzero(saltbin, sizeof(saltbin)); - encrypt(hashbin, sizeof(hashbin), key, ENC); - } - - dst = buf; - memcpy(dst, setting, prefixlen + saltstrlen); - dst += prefixlen + saltstrlen; - *dst++ = '$'; - - dst = encode64(dst, buflen - (dst - buf), hashbin, sizeof(hashbin)); - insecure_memzero(hashbin, sizeof(hashbin)); - if (!dst || dst >= buf + buflen) - return NULL; - - *dst = 0; /* NUL termination */ - - return buf; - -fail: - insecure_memzero(saltbin, sizeof(saltbin)); - insecure_memzero(hashbin, sizeof(hashbin)); - return NULL; -} - -uint8_t *yescrypt(const uint8_t *passwd, const uint8_t *setting) -{ - /* prefix, '$', hash, NUL */ - static uint8_t buf[PREFIX_LEN + 1 + HASH_LEN + 1]; - yescrypt_local_t local; - uint8_t *retval; - - if (yescrypt_init_local(&local)) - return NULL; - retval = yescrypt_r(NULL, &local, - passwd, strlen((char *)passwd), setting, NULL, buf, sizeof(buf)); - if (yescrypt_free_local(&local)) - return NULL; - return retval; -} - -uint8_t *yescrypt_reencrypt(uint8_t *hash, - const yescrypt_binary_t *from_key, - const yescrypt_binary_t *to_key) -{ - uint8_t *retval = NULL, *saltstart, *hashstart; - const uint8_t *hashend; - unsigned char saltbin[64], hashbin[32]; - size_t saltstrlen, saltlen, hashlen; - - if (strncmp((char *)hash, "$y$", 3)) - return NULL; - - saltstart = NULL; - hashstart = (uint8_t *)strrchr((char *)hash, '$'); - if (hashstart) { - if (hashstart > (uint8_t *)hash) { - saltstart = hashstart - 1; - while (*saltstart != '$' && saltstart > hash) - saltstart--; - if (*saltstart == '$') - saltstart++; - } - hashstart++; - } else { - hashstart = hash; - } - saltstrlen = saltstart ? (hashstart - 1 - saltstart) : 0; - if (saltstrlen > BYTES2CHARS(64) || - strlen((char *)hashstart) != HASH_LEN) - return NULL; - - if (saltstrlen) { - const uint8_t *saltend; - saltlen = sizeof(saltbin); - saltend = decode64(saltbin, &saltlen, saltstart, saltstrlen); - if (!saltend || *saltend != '$' || saltlen < 1 || saltlen > 64) - goto out; - - if (from_key) - encrypt(saltbin, saltlen, from_key, ENC); - if (to_key) - encrypt(saltbin, saltlen, to_key, DEC); - } - - hashlen = sizeof(hashbin); - hashend = decode64(hashbin, &hashlen, hashstart, HASH_LEN); - if (!hashend || *hashend || hashlen != sizeof(hashbin)) - goto out; - - if (from_key) - encrypt(hashbin, hashlen, from_key, DEC); - if (to_key) - encrypt(hashbin, hashlen, to_key, ENC); - - if (saltstrlen) { - if (!encode64(saltstart, saltstrlen + 1, saltbin, saltlen)) - goto out; /* can't happen */ - *(saltstart + saltstrlen) = '$'; - } - - if (!encode64(hashstart, HASH_LEN + 1, hashbin, hashlen)) - goto out; /* can't happen */ - - retval = hash; - -out: - insecure_memzero(saltbin, sizeof(saltbin)); - insecure_memzero(hashbin, sizeof(hashbin)); - - return retval; -} - -static uint32_t N2log2(uint64_t N) -{ - uint32_t N_log2; - - if (N < 2) - return 0; - - N_log2 = 2; - while (N >> N_log2 != 0) - N_log2++; - N_log2--; - - if (N >> N_log2 != 1) - return 0; - - return N_log2; -} - -uint8_t *yescrypt_encode_params_r(const yescrypt_params_t *params, - const uint8_t *src, size_t srclen, - uint8_t *buf, size_t buflen) -{ - uint32_t flavor, N_log2, NROM_log2, have; - uint8_t *dst; - - if (srclen > SIZE_MAX / 16) - return NULL; - - if (params->flags < YESCRYPT_RW) { - flavor = params->flags; - } else if ((params->flags & YESCRYPT_MODE_MASK) == YESCRYPT_RW && - params->flags <= (YESCRYPT_RW | YESCRYPT_RW_FLAVOR_MASK)) { - flavor = YESCRYPT_RW + (params->flags >> 2); - } else { - return NULL; - } - - N_log2 = N2log2(params->N); - if (!N_log2) - return NULL; - - NROM_log2 = N2log2(params->NROM); - if (params->NROM && !NROM_log2) - return NULL; - - if ((uint64_t)params->r * (uint64_t)params->p >= (1U << 30)) - return NULL; - - dst = buf; - *dst++ = '$'; - *dst++ = 'y'; - *dst++ = '$'; - - dst = encode64_uint32(dst, buflen - (dst - buf), flavor, 0); - if (!dst) - return NULL; - - dst = encode64_uint32(dst, buflen - (dst - buf), N_log2, 1); - if (!dst) - return NULL; - - dst = encode64_uint32(dst, buflen - (dst - buf), params->r, 1); - if (!dst) - return NULL; - - have = 0; - if (params->p != 1) - have |= 1; - if (params->t) - have |= 2; - if (params->g) - have |= 4; - if (NROM_log2) - have |= 8; - - if (have) { - dst = encode64_uint32(dst, buflen - (dst - buf), have, 1); - if (!dst) - return NULL; - } - - if (params->p != 1) { - dst = encode64_uint32(dst, buflen - (dst - buf), params->p, 2); - if (!dst) - return NULL; - } - - if (params->t) { - dst = encode64_uint32(dst, buflen - (dst - buf), params->t, 1); - if (!dst) - return NULL; - } - - if (params->g) { - dst = encode64_uint32(dst, buflen - (dst - buf), params->g, 1); - if (!dst) - return NULL; - } - - if (NROM_log2) { - dst = encode64_uint32(dst, buflen - (dst - buf), NROM_log2, 1); - if (!dst) - return NULL; - } - - if (dst >= buf + buflen) - return NULL; - - *dst++ = '$'; - - dst = encode64(dst, buflen - (dst - buf), src, srclen); - if (!dst || dst >= buf + buflen) - return NULL; - - *dst = 0; /* NUL termination */ - - return buf; -} - -uint8_t *yescrypt_encode_params(const yescrypt_params_t *params, - const uint8_t *src, size_t srclen) -{ - /* prefix, NUL */ - static uint8_t buf[PREFIX_LEN + 1]; - return yescrypt_encode_params_r(params, src, srclen, buf, sizeof(buf)); -} - -int crypto_scrypt(const uint8_t *passwd, size_t passwdlen, - const uint8_t *salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, - uint8_t *buf, size_t buflen) -{ - yescrypt_local_t local; - yescrypt_params_t params = { .flags = 0, .N = N, .r = r, .p = p }; - int retval; - - if (yescrypt_init_local(&local)) - return -1; - retval = yescrypt_kdf(NULL, &local, - passwd, passwdlen, salt, saltlen, ¶ms, buf, buflen); - if (yescrypt_free_local(&local)) - return -1; - return retval; -} diff --git a/src/crypto/randomx/defyx/yescrypt-neon.c b/src/crypto/randomx/defyx/yescrypt-neon.c deleted file mode 100644 index ed6ff6e0..00000000 --- a/src/crypto/randomx/defyx/yescrypt-neon.c +++ /dev/null @@ -1,1326 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -/* - * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding - * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX - * and XOP are of further help either way. - */ -#include - -#include -#include -#include -#include - -#include "insecure_memzero.h" -#include "sha256.h" -#include "sysendian.h" - -#include "yescrypt.h" - -#include "yescrypt-platform.c" - -#if __STDC_VERSION__ >= 199901L -/* have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -#define PREFETCH(x, hint) /*_mm_prefetch((const char *)(x), (hint));*/ -#define PREFETCH_OUT(x, hint) /* disabled */ - -#define ARX(out, in1, in2, s) \ - { \ - uint32x4_t T = vaddq_u32(in1, in2); \ - out = veorq_u32(out, vshlq_n_u32(T, s)); \ - out = veorq_u32(out, vshrq_n_u32(T, 32-s)); \ - } - -#define SALSA20_2ROUNDS \ - /* Operate on "columns" */ \ - ARX(X1, X0, X3, 7) \ - ARX(X2, X1, X0, 9) \ - ARX(X3, X2, X1, 13) \ - ARX(X0, X3, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = vextq_u32(X1, X1, 3); \ - X2 = vextq_u32(X2, X2, 2); \ - X3 = vextq_u32(X3, X3, 1); \ -\ - /* Operate on "rows" */ \ - ARX(X3, X0, X1, 7) \ - ARX(X2, X3, X0, 9) \ - ARX(X1, X2, X3, 13) \ - ARX(X0, X1, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = vextq_u32(X1, X1, 1); \ - X2 = vextq_u32(X2, X2, 2); \ - X3 = vextq_u32(X3, X3, 3); - -/** - * Apply the salsa20/8 core to the block provided in (X0 ... X3). - */ -#define SALSA20_8_BASE(maybe_decl, out) \ - { \ - maybe_decl Y0 = X0; \ - maybe_decl Y1 = X1; \ - maybe_decl Y2 = X2; \ - maybe_decl Y3 = X3; \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - (out)[0] = X0 = vaddq_u32(X0, Y0); \ - (out)[1] = X1 = vaddq_u32(X1, Y1); \ - (out)[2] = X2 = vaddq_u32(X2, Y2); \ - (out)[3] = X3 = vaddq_u32(X3, Y3); \ - } -#define SALSA20_8(out) \ - SALSA20_8_BASE(uint32x4_t, out) - -/** - * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). - */ -#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ - X0 = veorq_u32(X0, Z0); \ - X1 = veorq_u32(X1, Z1); \ - X2 = veorq_u32(X2, Z2); \ - X3 = veorq_u32(X3, Z3); \ - SALSA20_8_BASE(maybe_decl, out) - -#define SALSA20_8_XOR_MEM(in, out) \ - SALSA20_8_XOR_ANY(uint32x4_t, (in)[0], (in)[1], (in)[2], (in)[3], out) - -#define SALSA20_8_XOR_REG(out) \ - SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) - -typedef union { - uint32_t w[16]; - uint32x4_t q[4]; -} salsa20_blk_t; - -/** - * blockmix_salsa8(Bin, Bout, r): - * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. - */ -static inline void -blockmix_salsa8(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout, size_t r) -{ - uint32x4_t X0, X1, X2, X3; - size_t i; - - r--; - PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - X0 = Bin[r * 2 + 1].q[0]; - X1 = Bin[r * 2 + 1].q[1]; - X2 = Bin[r * 2 + 1].q[2]; - X3 = Bin[r * 2 + 1].q[3]; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q) -} - -/* - * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs - * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and - * destination registers, whereas the shifts would require an extra move - * instruction for our code when building without AVX. Unfortunately, PSHUFD - * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) - * and somewhat slower on some non-Intel CPUs (luckily not including AMD - * Bulldozer and Piledriver). Since for many other CPUs using (V)PSHUFD is a - * win in terms of throughput or/and not needing a move instruction, we - * currently use it despite of the higher latency on some older CPUs. As an - * alternative, the #if below may be patched to only enable use of (V)PSHUFD - * when building with SSE4.1 or newer, which is not available on older CPUs - * where this instruction has higher latency. - */ -#define LO32(X) \ - vmovn_u64(vreinterpretq_u64_u32(X)) -#define HI32(X) \ - LO32(vrev64q_u32(X)) - -#define EXTRACT64(X) \ - vgetq_lane_u64(vreinterpretq_u64_u32(X), 0) - -/* This is tunable */ -#define S_BITS 8 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define S_SIMD 2 -#define S_P 4 - -/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ -#define S_N 2 - -/* Derived values. Not tunable except via S_BITS above. */ -#define S_SIZE1 (1 << S_BITS) -#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) -#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) -#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) - -#define PWXFORM_X_T uint64_t -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = EXTRACT64(X) & S_MASK2; \ - s0 = *(const uint32x4_t *)(S0 + (uint32_t)x); \ - s1 = *(const uint32x4_t *)(S1 + (x >> 32)); \ - X = vreinterpretq_u32_u64(vmull_u32(HI32(X), LO32(X))); \ - X = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(X), vreinterpretq_u64_u32(s0))); \ - X = veorq_u32(X, s1); - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0, x0, s00, s01) \ - PWXFORM_SIMD(X1, x1, s10, s11) \ - PWXFORM_SIMD(X2, x2, s20, s21) \ - PWXFORM_SIMD(X3, x3, s30, s31) - -#define PWXFORM \ - { \ - PWXFORM_X_T x0, x1, x2, x3; \ - uint32x4_t s00, s01, s10, s11, s20, s21, s30, s31; \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - } - -#define XOR4(in) \ - X0 = veorq_u32(X0, (in)[0]); \ - X1 = veorq_u32(X1, (in)[1]); \ - X2 = veorq_u32(X2, (in)[2]); \ - X3 = veorq_u32(X3, (in)[3]); - -#define OUT(out) \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. - */ -static void -blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, - size_t r, const uint32x4_t *restrict S) -{ - const uint8_t * S0, * S1; - uint32x4_t X0, X1, X2, X3; - size_t i; - - if (!S) { - blockmix_salsa8(Bin, Bout, r); - return; - } - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - - /* X <-- B_{r1 - 1} */ - X0 = Bin[r].q[0]; - X1 = Bin[r].q[1]; - X2 = Bin[r].q[2]; - X3 = Bin[r].q[3]; - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin[i].q) - PWXFORM - /* B'_i <-- X */ - OUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) -} - -#define XOR4_2(in1, in2) \ - X0 = veorq_u32((in1)[0], (in2)[0]); \ - X1 = veorq_u32((in1)[1], (in2)[1]); \ - X2 = veorq_u32((in1)[2], (in2)[2]); \ - X3 = veorq_u32((in1)[3], (in2)[3]); - -static inline uint32_t -blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM) -{ - uint32x4_t X0, X1, X2, X3; - size_t i; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } else { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q) - SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q) - SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) - - return vgetq_lane_u32(X0, 0); -} - -static uint32_t -blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM, const uint32x4_t *restrict S) -{ - const uint8_t * S0, * S1; - uint32x4_t X0, X1, X2, X3; - size_t i; - - if (!S) - return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r], _MM_HINT_NTA) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_NTA) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } else { - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - /* B'_i <-- X */ - OUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return vgetq_lane_u32(X0, 0); -} - -#undef XOR4 -#define XOR4(in, out) \ - (out)[0] = Y0 = veorq_u32((in)[0], (out)[0]); \ - (out)[1] = Y1 = veorq_u32((in)[1], (out)[1]); \ - (out)[2] = Y2 = veorq_u32((in)[2], (out)[2]); \ - (out)[3] = Y3 = veorq_u32((in)[3], (out)[3]); - -static inline uint32_t -blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r) -{ - uint32x4_t X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - r--; - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q, Bin2[0].q) - SALSA20_8_XOR_REG(Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q, Bin2[i * 2].q) - SALSA20_8_XOR_REG(Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) - - return vgetq_lane_u32(X0, 0); -} - -#define XOR4_Y \ - X0 = veorq_u32(X0, Y0); \ - X1 = veorq_u32(X1, Y1); \ - X2 = veorq_u32(X2, Y2); \ - X3 = veorq_u32(X3, Y3); - -static uint32_t -blockmix_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, const uint32x4_t *restrict S) -{ - const uint8_t * S0, * S1; - uint32x4_t X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - if (!S) - return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - XOR4(Bin1[i].q, Bin2[i].q) - /* X <-- H'(X \xor B_i) */ - XOR4_Y - PWXFORM - /* B'_i <-- X */ - OUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q, Bin2[i].q) - XOR4_Y - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return vgetq_lane_u32(X0, 0); -} - -#undef ARX -#undef SALSA20_2ROUNDS -#undef SALSA20_8 -#undef SALSA20_8_XOR_ANY -#undef SALSA20_8_XOR_MEM -#undef SALSA20_8_XOR_REG -#undef PWXFORM_SIMD_1 -#undef PWXFORM_SIMD_2 -#undef PWXFORM_ROUND -#undef PWXFORM -#undef OUT -#undef XOR4 -#undef XOR4_2 -#undef XOR4_Y - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t -integerify(const salsa20_blk_t * B, size_t r) -{ - return B[2 * r - 1].w[0]; -} - -/** - * smix1(B, r, N, flags, V, NROM, shared, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r bytes in length. The value N must be even and no - * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and - * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 - * bytes as well saves cache lines, but might result in cache bank conflicts). - */ -static void -smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = V, * Y; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - if (NROM && (VROM_mask & 1)) { - uint32_t n; - salsa20_blk_t * V_n; - const salsa20_blk_t * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - X = &V[2 * s]; - if ((1 & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - } else { - /* X <-- H(X) */ - blockmix(Y, X, r, S); - j = integerify(X, r); - } - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V_n[i * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((n + i) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 1, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((N - 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - } - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 1, S); - } else if (flags & YESCRYPT_RW) { - uint32_t n; - salsa20_blk_t * V_n, * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[2 * s]; - blockmix(Y, X, r, S); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - Y = &V_n[i * s]; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 0, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 0, S); - } else { - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < N - 1; i += 2) { - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[(i + 1) * s]; - blockmix(Y, X, r, S); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - X = XY; - blockmix(Y, X, r, S); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. The value Nloop must be even. The array V must be aligned - * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 - * bytes (aligning them to 64 bytes as well saves cache lines, but might result - * in cache bank conflicts). - */ -static void -smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, - yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, - const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = XY, * Y = &XY[s]; - uint64_t i; - uint32_t j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - i = Nloop / 2; - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - -/* - * Normally, NROM implies YESCRYPT_RW, but we check for these separately - * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls - * operating on the entire V. - */ - if (NROM && (flags & YESCRYPT_RW)) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(X, V_j, Y, r, S); - - if (((i + 1) & VROM_mask) == 1) { - const salsa20_blk_t * VROM_j; - - j &= NROM - 1; - VROM_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, VROM_j, X, r, 1, S); - } else { - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(Y, V_j, X, r, S); - } - j &= N - 1; - V_j = &V[j * s]; - } - } else if (NROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((i + 1) & VROM_mask) == 1) { - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - j &= N - 1; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - j &= N - 1; - V_j = &V[j * s]; - } - } else if (flags & YESCRYPT_RW) { - /* 6: for i = 0 to N - 1 do */ - do { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(X, V_j, Y, r, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(Y, V_j, X, r, S); - j &= N - 1; - } while (--i); - } else { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 0, S); - j &= N - 1; - } while (--i); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage XY - * must be 256r or 256rp bytes in length (the larger size is required with - * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and - * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well - * saves cache lines and helps avoid false sharing in OpenMP-enabled builds - * when p > 1, but it might also result in cache bank conflicts). - */ -static void -smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - size_t s = 2 * r; - uint32_t Nchunk = N / p; - uint64_t Nloop_all, Nloop_rw; - uint32_t i; - - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) - Nloop_rw = Nloop_all; - else if (flags & YESCRYPT_RW) - Nloop_rw = Nloop_all / p; - - Nchunk &= ~(uint32_t)1; /* round down to even */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - Nloop_rw &= ~(uint64_t)1; /* round down to even */ - -#ifdef _OPENMP -#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) - { -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint32_t Vchunk = i * Nchunk; - uint8_t * Bp = &B[128 * r * i]; - salsa20_blk_t * Vp = &V[Vchunk * s]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; - if (Sp) - smix1(Bp, 1, S_SIZE_ALL / 128, - flags & ~YESCRYPT_PWXFORM, - Sp, NROM, shared, XYp, NULL); - if (!(flags & __YESCRYPT_INIT_SHARED_2)) - smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, shared, XYp, Sp); - } - - if (Nloop_all > Nloop_rw) { -#ifdef _OPENMP -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint8_t * Bp = &B[128 * r * i]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; - smix2(Bp, r, N, Nloop_all - Nloop_rw, - flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); - } - } -#ifdef _OPENMP - } -#endif -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. (This optimized implementation currently additionally - * limits N to the range from 8 to 2^31, but other implementation might not.) - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_region_t tmp; - uint64_t NROM; - size_t B_size, V_size, XY_size, need; - uint8_t * B, * S; - salsa20_blk_t * V, * XY; - uint8_t sha256[32]; - - /* - * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, - * so don't let it have side-effects. Without this adjustment, it'd - * enable the SHA-256 password pre-hashing and output post-hashing, - * because any deviation from classic scrypt implies those. - */ - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (N > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 256 / p) || - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX) && - (N > SIZE_MAX / 128 / (r * p))) { - errno = ENOMEM; - return -1; - } -#endif - if ((flags & YESCRYPT_PWXFORM) && -#ifndef _OPENMP - (flags & YESCRYPT_PARALLEL_SMIX) && -#endif - p > SIZE_MAX / S_SIZE_ALL) { - errno = ENOMEM; - return -1; - } - - NROM = 0; - if (shared->shared1.aligned) { - NROM = shared->shared1.aligned_size / ((size_t)128 * r); - if (NROM > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (salsa20_blk_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_PWXFORM) { - size_t S_size = S_SIZE_ALL; -#ifdef _OPENMP - S_size *= p; -#else - if (flags & YESCRYPT_PARALLEL_SMIX) - S_size *= p; -#endif - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint8_t *)tmp.aligned; - XY = (salsa20_blk_t *)((uint8_t *)B + B_size); - } else { - init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_PWXFORM) - S = (uint8_t *)XY + XY_size; - - if (t || flags) { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, passwd, passwdlen); - SHA256_Final(sha256, &ctx); - passwd = sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); - - if (t || flags) - memcpy(sha256, B, sizeof(sha256)); - - if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { - smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, - &V[(size_t)2 * r * i * N], - NROM, shared, - &XY[(size_t)4 * r * i], - S ? &S[S_SIZE_ALL * i] : S); -#else - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, - NROM, shared, XY, S); -#endif - } - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if ((t || flags) && buflen == sizeof(sha256)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, buf, buflen); -#if 0 -/* Proper yescrypt */ - HMAC_SHA256_Update(&ctx, "Client Key", 10); -#else -/* GlobalBoost-Y buggy yescrypt */ - HMAC_SHA256_Update(&ctx, salt, saltlen); -#endif - HMAC_SHA256_Final(sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, sha256, sizeof(sha256)); - SHA256_Final(buf, &ctx); - } - } - - if (free_region(&tmp)) - return -1; - - /* Success! */ - return 0; -} diff --git a/src/crypto/randomx/defyx/yescrypt-opt.c b/src/crypto/randomx/defyx/yescrypt-opt.c deleted file mode 100644 index c621af6e..00000000 --- a/src/crypto/randomx/defyx/yescrypt-opt.c +++ /dev/null @@ -1,1103 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013-2015 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include -#include -#include -#include - -#include "insecure_memzero.h" -#include "sha256.h" -#include "sysendian.h" - -#include "yescrypt.h" - -#include "yescrypt-platform.c" - -static inline void -blkcpy(uint64_t * dest, const uint64_t * src, size_t count) -{ - do { - *dest++ = *src++; *dest++ = *src++; - *dest++ = *src++; *dest++ = *src++; - } while (count -= 4); -} - -static inline void -blkxor(uint64_t * dest, const uint64_t * src, size_t count) -{ - do { - *dest++ ^= *src++; *dest++ ^= *src++; - *dest++ ^= *src++; *dest++ ^= *src++; - } while (count -= 4); -} - -typedef union { - uint32_t w[16]; - uint64_t d[8]; -} salsa20_blk_t; - -static inline void -salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) -{ -#define COMBINE(out, in1, in2) \ - Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); - COMBINE(0, 0, 2) - COMBINE(1, 5, 7) - COMBINE(2, 2, 4) - COMBINE(3, 7, 1) - COMBINE(4, 4, 6) - COMBINE(5, 1, 3) - COMBINE(6, 6, 0) - COMBINE(7, 3, 5) -#undef COMBINE -} - -static inline void -salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) -{ -#define UNCOMBINE(out, in1, in2) \ - Bout->w[out * 2] = Bin->d[in1]; \ - Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; - UNCOMBINE(0, 0, 6) - UNCOMBINE(1, 5, 3) - UNCOMBINE(2, 2, 0) - UNCOMBINE(3, 7, 5) - UNCOMBINE(4, 4, 2) - UNCOMBINE(5, 1, 7) - UNCOMBINE(6, 6, 4) - UNCOMBINE(7, 3, 1) -#undef UNCOMBINE -} - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided block. - */ -static void -salsa20(uint64_t B[8], uint32_t doublerounds) -{ - salsa20_blk_t X; -#define x X.w - - salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); - - do { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); -#undef R - } while (--doublerounds); -#undef x - - { - uint32_t i; - salsa20_blk_t Y; - salsa20_simd_shuffle(&X, &Y); - for (i = 0; i < 16; i += 4) { - ((salsa20_blk_t *)B)->w[i] += Y.w[i]; - ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; - ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; - ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; - } - } -} - -/** - * blockmix_salsa8(Bin, Bout, X, r): - * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. The - * temporary space X must be 64 bytes. - */ -static void -blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) -{ - size_t i; - - /* 1: X <-- B_{2r - 1} */ - blkcpy(X, &Bin[(2 * r - 1) * 8], 8); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < 2 * r; i += 2) { - /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8], 8); - salsa20(X, 4); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4], X, 8); - - /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8 + 8], 8); - salsa20(X, 4); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4 + r * 8], X, 8); - } -} - -/* These are tunable */ -#define PWXsimple 2 -#define PWXgather 4 -#define PWXrounds 6 -#define Swidth 8 - -/* Derived values. Not tunable on their own. */ -#define PWXbytes (PWXgather * PWXsimple * 8) -#define PWXwords (PWXbytes / sizeof(uint64_t)) -#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) -#define Swords (Sbytes / sizeof(uint64_t)) -#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) -#define Smask2 (((uint64_t)Smask << 32) | Smask) -#define rmin ((PWXbytes + 127) / 128) - -#if PWXbytes % 32 != 0 -#error "blkcpy() and blkxor() currently work on multiples of 32." -#endif - -typedef struct { - uint64_t *S0, *S1, *S2; - size_t w; -} pwxform_ctx_t; - -#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) - -/** - * pwxform(B): - * Transform the provided block using the provided S-boxes. - */ -static void -pwxform(uint64_t * B, pwxform_ctx_t * ctx) -{ - uint64_t (*X)[PWXsimple] = (uint64_t (*)[PWXsimple])B; - uint64_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; - size_t w = ctx->w; - size_t i, j; -#if PWXsimple > 2 - size_t k; -#endif - - /* 2: for j = 0 to PWXgather - 1 do */ - for (j = 0; j < PWXgather; j++) { - uint64_t *Xj = X[j]; - uint64_t x0 = Xj[0]; -#if PWXsimple > 1 - uint64_t x1 = Xj[1]; -#endif - - /* 1: for i = 0 to PWXrounds - 1 do */ - for (i = 0; i < PWXrounds; i++) { - uint64_t x = x0 & Smask2; - const uint64_t *p0, *p1; - - /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p0 = (const uint64_t *)((uint8_t *)S0 + (uint32_t)x); - /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p1 = (const uint64_t *)((uint8_t *)S1 + (x >> 32)); - - /* 5: for k = 0 to PWXsimple - 1 do */ - /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ - x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; - x0 += p0[0]; - x0 ^= p1[0]; - -#if PWXsimple > 1 - /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ - x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; - x1 += p0[1]; - x1 ^= p1[1]; -#endif - -#if PWXsimple > 2 - /* 5: for k = 0 to PWXsimple - 1 do */ - for (k = 2; k < PWXsimple; k++) { - /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ - x = Xj[k]; - - x = (uint64_t)(x >> 32) * (uint32_t)x; - x += p0[k]; - x ^= p1[k]; - - Xj[k] = x; - } -#endif - - /* 8: if (i != 0) and (i != PWXrounds - 1) */ - if ((i - 1) < PWXrounds - 2) { - /* 9: S2_w <-- B_j */ - /* 10: w <-- w + 1 */ - uint64_t *p2 = (uint64_t *)((uint8_t *)S2 + w); - w += PWXbytes; - p2[0] = x0; -#if PWXsimple > 1 - p2[1] = x1; -#endif -#if PWXsimple > 2 - for (k = 2; k < PWXsimple; k++) - p2[k] = Xj[k]; -#endif - } - } - - Xj[0] = x0; -#if PWXsimple > 1 - Xj[1] = x1; -#endif - - w -= (PWXrounds - 2) * PWXbytes - PWXsimple * 8; - } - - /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ - ctx->S0 = S2; - ctx->S1 = S0; - ctx->S2 = S1; - /* 15: w <-- w mod 2^Swidth */ - ctx->w = (w + (PWXrounds - 3) * PWXbytes) & Smask; -} - -/** - * blockmix_pwxform(Bin, Bout, S, r): - * Compute Bout = BlockMix_pwxform{salsa20/2, ctx, r}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. - */ -static void -blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, - pwxform_ctx_t * ctx, size_t r) -{ - size_t r1, r2, i; - - /* Convert 128-byte blocks to PWXbytes blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r1 = r * 128 / PWXbytes; - - /* 2: X <-- B'_{r_1 - 1} */ - blkcpy(Bout, &Bin[(r1 - 1) * PWXwords], PWXwords); - - /* 3: for i = 0 to r_1 - 1 do */ - /* 4: if r_1 > 1 */ - if (r1 > 1) { - /* 5: X <-- X \xor B'_i */ - blkxor(Bout, Bin, PWXwords); - } - - /* 7: X <-- pwxform(X) */ - /* 8: B'_i <-- X */ - pwxform(Bout, ctx); - - /* 3: for i = 0 to r_1 - 1 do */ - for (i = 1; i < r1; i++) { - /* 5: X <-- X \xor B'_i */ - blkcpy(&Bout[i * PWXwords], &Bout[(i - 1) * PWXwords], - PWXwords); - blkxor(&Bout[i * PWXwords], &Bin[i * PWXwords], PWXwords); - - /* 7: X <-- pwxform(X) */ - /* 8: B'_i <-- X */ - pwxform(&Bout[i * PWXwords], ctx); - } - -#if PWXbytes > 128 - /* - * Handle partial blocks. If we were using just one buffer, like in - * the algorithm specification, the data would already be there, but - * since we use separate input and output buffers, we may have to copy - * some data over (which will then be processed by the Salsa20/8 - * invocations below) in this special case - that is, when 128r is not - * a multiple of PWXbytes. Since PWXgather and PWXsimple must each be - * a power of 2 (per the specification), PWXbytes is also a power of 2. - * Thus, 128r is obviously a multiple of valid values of PWXbytes up to - * 128, inclusive. When PWXbytes is larger than that (thus, 256 or - * larger) we perform this extra check. - */ - if (i * PWXwords < r * 16) - blkcpy(&Bout[i * PWXwords], &Bin[i * PWXwords], - r * 16 - i * PWXwords); -#endif - - /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ - i = (r1 - 1) * PWXbytes / 64; - - /* Convert 128-byte blocks to 64-byte blocks */ - r2 = r * 2; - - /* 11: B_i <-- H(B_i) */ - salsa20(&Bout[i * 8], 1); - - for (i++; i < r2; i++) { - /* 13: B_i <-- H(B_i \xor B_{i-1}) */ - blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); - salsa20(&Bout[i * 8], 1); - } -} - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint64_t -integerify(const uint64_t * B, size_t r) -{ -/* - * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit - * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also - * in host byte order, as it should be. - */ - const uint64_t * X = &B[(2 * r - 1) * 8]; - uint32_t lo = X[0]; - uint32_t hi = X[6] >> 32; - return ((uint64_t)hi << 32) + lo; -} - -/** - * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r + 64 bytes in length. The value N must be even and - * no smaller than 2. - */ -static void -smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const uint64_t * VROM, - uint64_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 16 * r; - uint64_t * X = V; - uint64_t * Y = &XY[s]; - uint64_t * Z = &XY[2 * s]; - uint64_t n, i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - if (ctx) - blockmix_pwxform(X, Y, ctx, r); - else - blockmix_salsa8(X, Y, Z, r); - blkcpy(&V[s], Y, s); - - X = XY; - - if (VROM) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - - blockmix_pwxform(Y, X, ctx, r); - - /* 2: for i = 0 to N - 1 do */ - for (n = 1, i = 2; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if ((i & (i - 1)) == 0) - n <<= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(X, r) & (n - 1); - j += i - n; - - /* X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - - /* 4: X <-- H(X) */ - blockmix_pwxform(X, Y, ctx, r); - - /* 3: V_i <-- X */ - blkcpy(&V[(i + 1) * s], Y, s); - - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - - blockmix_pwxform(Y, X, ctx, r); - } - } else if (flags & YESCRYPT_RW) { - /* 4: X <-- H(X) */ - blockmix_pwxform(Y, X, ctx, r); - - /* 2: for i = 0 to N - 1 do */ - for (n = 1, i = 2; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if ((i & (i - 1)) == 0) - n <<= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(X, r) & (n - 1); - j += i - n; - - /* X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - - /* 4: X <-- H(X) */ - blockmix_pwxform(X, Y, ctx, r); - - /* 3: V_i <-- X */ - blkcpy(&V[(i + 1) * s], Y, s); - - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(Y, r) & (n - 1); - j += (i + 1) - n; - - /* X <-- X \xor V_j */ - blkxor(Y, &V[j * s], s); - - /* 4: X <-- H(X) */ - blockmix_pwxform(Y, X, ctx, r); - } - } else { - /* 4: X <-- H(X) */ - blockmix_salsa8(Y, X, Z, r); - - /* 2: for i = 0 to N - 1 do */ - for (n = 1, i = 2; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - /* 4: X <-- H(X) */ - blockmix_salsa8(X, Y, Z, r); - - /* 3: V_i <-- X */ - blkcpy(&V[(i + 1) * s], Y, s); - - /* 4: X <-- H(X) */ - blockmix_salsa8(Y, X, Z, r); - } - } - - /* B' <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r + 64 bytes in length. The value N must be a - * power of 2 greater than 1. The value Nloop must be even. - */ -static void -smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, - yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const uint64_t * VROM, - uint64_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 16 * r; - uint64_t * X = XY; - uint64_t * Y = &XY[s]; - uint64_t i, j; - - if (Nloop == 0) - return; - - /* X <-- B' */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; - size_t k; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - if (VROM) { - yescrypt_flags_t rw = flags & YESCRYPT_RW; - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(X, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], X, s); - blockmix_pwxform(X, Y, ctx, r); - - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - - blockmix_pwxform(Y, X, ctx, r); - } - } else if (ctx) { - yescrypt_flags_t rw = flags & YESCRYPT_RW; - - /* 6: for i = 0 to N - 1 do */ - i = Nloop / 2; - do { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(X, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], X, s); - blockmix_pwxform(X, Y, ctx, r); - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(Y, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(Y, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], Y, s); - blockmix_pwxform(Y, X, ctx, r); - } while (--i); - } else { - uint64_t * Z = &XY[2 * s]; - - /* 6: for i = 0 to N - 1 do */ - i = Nloop / 2; - do { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(X, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - blockmix_salsa8(X, Y, Z, r); - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(Y, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(Y, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - blockmix_salsa8(Y, X, Z, r); - } while (--i); - } - - /* 10: B' <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; - size_t k; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is - * required with OpenMP-enabled builds). The value N must be a power of 2 - * greater than 1. - */ -static void -smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const uint64_t * VROM, - uint64_t * XY, uint8_t * S, uint8_t * passwd) -{ - size_t s = 16 * r; - uint64_t Nchunk, Nloop_all, Nloop_rw; - uint32_t i; - - /* 1: n <-- N / p */ - Nchunk = N / p; - - /* 2: Nloop_all <-- fNloop(n, t, flags) */ - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - /* 6: Nloop_rw <-- 0 */ - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) { - Nloop_rw = Nloop_all; - } else { - /* 3: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - /* 4: Nloop_rw <-- Nloop_all / p */ - Nloop_rw = Nloop_all / p; - } - } - - /* 8: n <-- n - (n mod 2) */ - Nchunk &= ~(uint64_t)1; /* round down to even */ - /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ - Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ - - /* 11: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) - { -#pragma omp for -#endif - for (i = 0; i < p; i++) { - /* 12: u <-- in */ - uint64_t Vchunk = i * Nchunk; - /* 13: if i = p - 1 */ - /* 14: n <-- N - u */ - /* 15: end if */ - /* 16: v <-- u + n - 1 */ - uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - uint64_t * Bp = &B[i * s]; - uint64_t * Vp = &V[Vchunk * s]; -#ifdef _OPENMP - uint64_t * XYp = &XY[i * (2 * s + 8)]; -#else - uint64_t * XYp = XY; -#endif - pwxform_ctx_t * ctx_i = NULL; - /* 17: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - uint64_t *Si = (uint64_t *)(S + i * Salloc); - /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ - smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, - Si, 0, NULL, XYp, NULL); - ctx_i = (pwxform_ctx_t *)(Si + Swords); - /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ - ctx_i->S2 = Si; - /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ - ctx_i->S1 = Si + Swords / 3; - /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ - ctx_i->S0 = Si + Swords / 3 * 2; - /* 22: w_i <-- 0 */ - ctx_i->w = 0; - /* 23: if i = 0 */ - if (i == 0) { - /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, Bp + (s - 8), 64); - HMAC_SHA256_Update(&ctx, passwd, 32); - HMAC_SHA256_Final(passwd, &ctx); - } - } - if (!(flags & __YESCRYPT_INIT_SHARED_2)) { - /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ - smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); - } - /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, VROM, XYp, ctx_i); - } - - /* 30: for i = 0 to p - 1 do */ - if (Nloop_all > Nloop_rw) { -#ifdef _OPENMP -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint64_t * Bp = &B[i * s]; -#ifdef _OPENMP - uint64_t * XYp = &XY[i * (2 * s + 8)]; -#else - uint64_t * XYp = XY; -#endif - pwxform_ctx_t * ctx_i = NULL; - if (flags & YESCRYPT_RW) - ctx_i = (pwxform_ctx_t *)(S + i * Salloc + Sbytes); - /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ - smix2(Bp, r, N, Nloop_all - Nloop_rw, - flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); - } - } -#ifdef _OPENMP - } -#endif -} - -/** - * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -static int -yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_region_t tmp; - uint64_t NROM; - const uint64_t * VROM; - size_t B_size, V_size, XY_size, need; - uint64_t * B, * V, * XY; - uint8_t * S; - uint64_t sha256[4]; - uint8_t dk[sizeof(sha256)], * dkp = buf; - - /* Sanity-check parameters */ - if ((flags & ~YESCRYPT_KNOWN_FLAGS) || (!flags && t)) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || -#if SIZE_MAX / 256 <= UINT32_MAX - (r > SIZE_MAX / 256) || -#endif - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } - if (N > UINT64_MAX / ((uint64_t)t + 1)) { - errno = EFBIG; - return -1; - } - if (flags & YESCRYPT_RW) { - if ((flags & YESCRYPT_WORM) || (N / p <= 1) || (r < rmin)) { - errno = EINVAL; - return -1; - } - if (p > SIZE_MAX / Salloc) { - errno = ENOMEM; - return -1; - } - } -#ifdef _OPENMP - else if (N > SIZE_MAX / 128 / (r * p)) { - errno = ENOMEM; - return -1; - } -#endif - - NROM = 0; - VROM = NULL; - if (shared) { - NROM = shared->aligned_size / ((size_t)128 * r); - if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - VROM = shared->aligned; - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_RW)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (uint64_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r + 64; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_RW) { - size_t S_size = (size_t)Salloc * p; - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint64_t *)tmp.aligned; - XY = (uint64_t *)((uint8_t *)B + B_size); - } else { - init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint64_t *)local->aligned; - V = (uint64_t *)((uint8_t *)B + B_size); - XY = (uint64_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_RW) - S = (uint8_t *)XY + XY_size; - - if (flags) { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, "yescrypt-prehash", - (flags & __YESCRYPT_PREHASH) ? 16 : 8); - HMAC_SHA256_Update(&ctx, passwd, passwdlen); - HMAC_SHA256_Final((uint8_t *)sha256, &ctx); - passwd = (uint8_t *)sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, - (uint8_t *)B, B_size); - - if (flags) - blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); - - if (p == 1 || (flags & YESCRYPT_RW)) { - smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, - (uint8_t *)sha256); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, - &V[(size_t)16 * r * i * N], - NROM, VROM, - &XY[((size_t)32 * r + 8) * i], NULL, NULL); -#else - smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, - NROM, VROM, XY, NULL, NULL); -#endif - } - } - - dkp = buf; - if (flags && buflen < sizeof(dk)) { - PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, - dk, sizeof(dk)); - dkp = dk; - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if (flags && !(flags & __YESCRYPT_PREHASH)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, dkp, sizeof(dk)); - HMAC_SHA256_Update(&ctx, "Client Key", 10); - HMAC_SHA256_Final((uint8_t *)sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX ctx; - size_t clen = buflen; - if (clen > sizeof(dk)) - clen = sizeof(dk); - SHA256_Init(&ctx); - SHA256_Update(&ctx, (uint8_t *)sha256, sizeof(sha256)); - SHA256_Final(dk, &ctx); - memcpy(buf, dk, clen); - } - } - - if (free_region(&tmp)) - return -1; - - /* Success! */ - return 0; -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, g, flags, buf, buflen): - * Compute scrypt or its revision as requested by the parameters. The inputs - * to this function are the same as those for yescrypt_kdf_body() above, with - * the addition of g, which controls hash upgrades (0 for no upgrades so far). - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, - yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - uint8_t dk[32]; - - if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && - p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, - dk, sizeof(dk)); - if (retval) - return retval; - passwd = dk; - passwdlen = sizeof(dk); - } - - do { - uint8_t * dkp = g ? dk : buf; - size_t dklen = g ? sizeof(dk) : buflen; - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N, r, p, t, flags, dkp, dklen); - if (retval) - return retval; - - passwd = dkp; - passwdlen = dklen; - - N <<= 2; - if (!N) - return -1; - t >>= 1; - } while (g--); - - return 0; -} diff --git a/src/crypto/randomx/defyx/yescrypt-platform.c b/src/crypto/randomx/defyx/yescrypt-platform.c deleted file mode 100644 index 3a8824c9..00000000 --- a/src/crypto/randomx/defyx/yescrypt-platform.c +++ /dev/null @@ -1,195 +0,0 @@ -/*- - * Copyright 2013-2015 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifdef __unix -#include -#endif - -#include - -#include "yescrypt.h" - -#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) - -#ifdef __x86_64__ -#define HUGEPAGE_SIZE (2 * 1024 * 1024) -#else -#undef HUGEPAGE_SIZE -#endif - -static void * -alloc_region(yescrypt_region_t * region, size_t size) -{ - size_t base_size = size; - uint8_t * base, * aligned; -#ifdef MAP_ANON - int flags = -#ifdef MAP_NOCORE - MAP_NOCORE | -#endif - MAP_ANON | MAP_PRIVATE; -#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) - size_t new_size = size; - const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; - if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { - flags |= MAP_HUGETLB; -/* - * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of - * huge page size, so let's round up to huge page size here. - */ - new_size = size + hugepage_mask; - new_size &= ~hugepage_mask; - } - base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (base != MAP_FAILED) { - base_size = new_size; - } else - if (flags & MAP_HUGETLB) { - flags &= ~MAP_HUGETLB; - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); - } - -#else - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); -#endif - if (base == MAP_FAILED) - base = NULL; - aligned = base; -#elif defined(HAVE_POSIX_MEMALIGN) - if ((errno = posix_memalign((void **)&base, 64, size)) != 0) - base = NULL; - aligned = base; -#else - base = aligned = NULL; - if (size + 63 < size) { - //errno = ENOMEM; - } else if ((base = malloc(size + 63)) != NULL) { - aligned = base + 63; - aligned -= (uintptr_t)aligned & 63; - } -#endif - region->base = base; - region->aligned = aligned; - region->base_size = base ? base_size : 0; - region->aligned_size = base ? size : 0; - return aligned; -} - -static inline void -init_region(yescrypt_region_t * region) -{ - region->base = region->aligned = NULL; - region->base_size = region->aligned_size = 0; -} - -static int -free_region(yescrypt_region_t * region) -{ - if (region->base) { -#ifdef MAP_ANON - if (munmap(region->base, region->base_size)) - return -1; -#else - free(region->base); -#endif - } - init_region(region); - return 0; -} - -int -yescrypt_init_shared(yescrypt_shared_t * shared, - const uint8_t * param, size_t paramlen, - uint64_t N, uint32_t r, uint32_t p, - yescrypt_init_shared_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_shared_t half1, half2; - uint8_t salt[32]; - - if (flags & YESCRYPT_SHARED_PREALLOCATED) { - if (!shared->aligned || !shared->aligned_size) - return -1; - } else { - init_region(shared); - } - if (!param && !paramlen && !N && !r && !p && !buf && !buflen) - return 0; - - if (yescrypt_kdf(NULL, shared, - param, paramlen, NULL, 0, N, r, p, 0, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - half1 = half2 = *shared; - half1.aligned_size /= 2; -#ifdef _MSC_VER - (uint8_t*)half2.aligned += half1.aligned_size; -#else - half2.aligned += half1.aligned_size; -#endif - half2.aligned_size = half1.aligned_size; - N /= 2; - - if (p > 1 && yescrypt_kdf(&half1, &half2, - param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_2, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf(&half2, &half1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf(&half1, &half2, - param, paramlen, salt, sizeof(salt), N, r, p, 0, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - buf, buflen)) - goto out; - - return 0; - -out: - if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) - free_region(shared); - return -1; -} - -int -yescrypt_free_shared(yescrypt_shared_t * shared) -{ - return free_region(shared); -} - -int -yescrypt_init_local(yescrypt_local_t * local) -{ - init_region(local); - return 0; -} - -int -yescrypt_free_local(yescrypt_local_t * local) -{ - return free_region(local); -} diff --git a/src/crypto/randomx/defyx/yescrypt-ref.c b/src/crypto/randomx/defyx/yescrypt-ref.c deleted file mode 100644 index 9121a847..00000000 --- a/src/crypto/randomx/defyx/yescrypt-ref.c +++ /dev/null @@ -1,880 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013-2015 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - * - * This is the reference implementation. Its purpose is to provide a simple - * human- and machine-readable specification that implementations intended - * for actual use should be tested against. It is deliberately mostly not - * optimized, and it is not meant to be used in production. Instead, use - * yescrypt-best.c or one of the source files included from there. - */ - -#include -#include -#include -#include - -#include "sha256.h" -#include "sysendian.h" - -#include "yescrypt.h" - -static void -blkcpy(uint32_t * dest, const uint32_t * src, size_t count) -{ - do { - *dest++ = *src++; - } while (--count); -} - -static void -blkxor(uint32_t * dest, const uint32_t * src, size_t count) -{ - do { - *dest++ ^= *src++; - } while (--count); -} - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided block. - */ -static void -salsa20(uint32_t B[16], uint32_t rounds) -{ - uint32_t x[16]; - size_t i; - - /* SIMD unshuffle */ - for (i = 0; i < 16; i++) - x[i * 5 % 16] = B[i]; - - for (i = 0; i < rounds; i += 2) { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); -#undef R - } - - /* SIMD shuffle */ - for (i = 0; i < 16; i++) - B[i] += x[i * 5 % 16]; -} - -/** - * blockmix_salsa8(B, Y, r): - * Compute B = BlockMix_{salsa20/8, r}(B). The input B must be 128r bytes in - * length; the temporary space Y must also be the same size. - */ -static void -blockmix_salsa8(uint32_t * B, uint32_t * Y, size_t r) -{ - uint32_t X[16]; - size_t i; - - /* 1: X <-- B_{2r - 1} */ - blkcpy(X, &B[(2 * r - 1) * 16], 16); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < 2 * r; i++) { - /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &B[i * 16], 16); - salsa20(X, 8); - - /* 4: Y_i <-- X */ - blkcpy(&Y[i * 16], X, 16); - } - - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - for (i = 0; i < r; i++) - blkcpy(&B[i * 16], &Y[(i * 2) * 16], 16); - for (i = 0; i < r; i++) - blkcpy(&B[(i + r) * 16], &Y[(i * 2 + 1) * 16], 16); -} - -/* These are tunable */ -#define PWXsimple 2 -#define PWXgather 4 -#define PWXrounds 6 -#define Swidth 8 - -/* Derived values. Not tunable on their own. */ -#define PWXbytes (PWXgather * PWXsimple * 8) -#define PWXwords (PWXbytes / sizeof(uint32_t)) -#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) -#define Swords (Sbytes / sizeof(uint32_t)) -#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) -#define rmin ((PWXbytes + 127) / 128) - -typedef struct { - uint32_t *S; - uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; - size_t w; -} pwxform_ctx_t; - -/** - * pwxform(B): - * Transform the provided block using the provided S-boxes. - */ -static void -pwxform(uint32_t * B, pwxform_ctx_t * ctx) -{ - uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B; - uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; - size_t w = ctx->w; - size_t i, j, k; - - /* 1: for i = 0 to PWXrounds - 1 do */ - for (i = 0; i < PWXrounds; i++) { - /* 2: for j = 0 to PWXgather - 1 do */ - for (j = 0; j < PWXgather; j++) { - uint32_t xl = X[j][0][0]; - uint32_t xh = X[j][0][1]; - uint32_t (*p0)[2], (*p1)[2]; - - /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p0 = S0 + (xl & Smask) / sizeof(*S0); - /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p1 = S1 + (xh & Smask) / sizeof(*S1); - - /* 5: for k = 0 to PWXsimple - 1 do */ - for (k = 0; k < PWXsimple; k++) { - uint64_t x, s0, s1; - - /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) \xor S1_{p1,k} */ - s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0]; - s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0]; - - xl = X[j][k][0]; - xh = X[j][k][1]; - - x = (uint64_t)xh * xl; - x += s0; - x ^= s1; - - X[j][k][0] = x; - X[j][k][1] = x >> 32; - - /* 8: if (i != 0) and (i != PWXrounds - 1) */ - if (i != 0 && i != PWXrounds - 1) { - /* 9: S2_w <-- B_j */ - S2[w][0] = x; - S2[w][1] = x >> 32; - /* 10: w <-- w + 1 */ - w++; - } - } - } - } - - /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ - ctx->S0 = S2; - ctx->S1 = S0; - ctx->S2 = S1; - /* 15: w <-- w mod 2^Swidth */ - ctx->w = w & ((1 << Swidth) * PWXsimple - 1); -} - -/** - * blockmix_pwxform(B, Y, ctx, r): - * Compute B = BlockMix_pwxform{salsa20/2, ctx, r}(B). The input B must be 128r - * bytes in length; the temporary space Y must be at least PWXbytes. - */ -static void -blockmix_pwxform(uint32_t * B, uint32_t * Y, pwxform_ctx_t * ctx, size_t r) -{ - size_t r1, i; - - /* Convert 128-byte blocks to PWXbytes blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r1 = 128 * r / PWXbytes; - - /* 2: X <-- B'_{r_1 - 1} */ - blkcpy(Y, &B[(r1 - 1) * PWXwords], PWXwords); - - /* 3: for i = 0 to r_1 - 1 do */ - for (i = 0; i < r1; i++) { - /* 4: if r_1 > 1 */ - if (r1 > 1) { - /* 5: X <-- X \xor B'_i */ - blkxor(Y, &B[i * PWXwords], PWXwords); - } - - /* 7: X <-- pwxform(X) */ - pwxform(Y, ctx); - - /* 8: B'_i <-- X */ - blkcpy(&B[i * PWXwords], Y, PWXwords); - } - - /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ - i = (r1 - 1) * PWXbytes / 64; - - /* 11: B_i <-- H(B_i) */ - salsa20(&B[i * 16], 2); - - /* 12: for i = i + 1 to 2r - 1 do */ - for (i++; i < 2 * r; i++) { - /* 13: B_i <-- H(B_i \xor B_{i-1}) */ - blkxor(&B[i * 16], &B[(i - 1) * 16], 16); - salsa20(&B[i * 16], 2); - } -} - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static uint64_t -integerify(const uint32_t * B, size_t r) -{ -/* - * Our 32-bit words are in host byte order, and word 13 is the second word of - * B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also in host - * byte order, as it should be. - */ - const uint32_t * X = &B[(2 * r - 1) * 16]; - return ((uint64_t)X[13] << 32) + X[0]; -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * wrap(x, i): - * Wrap x to the range 0 to i-1. - */ -static uint64_t -wrap(uint64_t x, uint64_t i) -{ - uint64_t n = p2floor(i); - return (x & (n - 1)) + (i - n); -} - -/** - * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. - */ -static void -smix1(uint32_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, - uint32_t * V, uint64_t NROM, const uint32_t * VROM, - uint32_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 32 * r; - uint32_t * X = XY; - uint32_t * Y = &XY[s]; - uint64_t i, j; - size_t k; - - /* 1: X <-- B */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); - - /* 2: for i = 0 to N - 1 do */ - for (i = 0; i < N; i++) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if (VROM && (i & 1)) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(X, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(X, &VROM[j * s], s); - } else if ((flags & YESCRYPT_RW) && i > 1) { - /* j <-- Wrap(Integerify(X), i) */ - j = wrap(integerify(X, r), i); - - /* X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - } - - /* 4: X <-- H(X) */ - if (ctx) - blockmix_pwxform(X, Y, ctx, r); - else - blockmix_salsa8(X, Y, r); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. - */ -static void -smix2(uint32_t * B, size_t r, uint64_t N, uint64_t Nloop, - yescrypt_flags_t flags, uint32_t * V, uint64_t NROM, - const uint32_t * VROM, uint32_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 32 * r; - uint32_t * X = XY; - uint32_t * Y = &XY[s]; - uint64_t i, j; - size_t k; - - /* X <-- B */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i++) { - if (VROM && (i & 1)) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(X, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(X, &VROM[j * s], s); - } else { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8.1: X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - /* V_j <-- X */ - if (flags & YESCRYPT_RW) - blkcpy(&V[j * s], X, s); - } - - /* 8.2: X <-- H(X) */ - if (ctx) - blockmix_pwxform(X, Y, ctx, r); - else - blockmix_salsa8(X, Y, r); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, ctx, passwd): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * XY must be 256r bytes in length. The value N must be a power of 2 greater - * than 1. - */ -static void -smix(uint32_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - uint32_t * V, uint64_t NROM, const uint32_t * VROM, - uint32_t * XY, pwxform_ctx_t * ctx, uint8_t * passwd) -{ - size_t s = 32 * r; - uint64_t Nchunk, Nloop_all, Nloop_rw, Vchunk; - uint32_t i; - - /* 1: n <-- N / p */ - Nchunk = N / p; - - /* 2: Nloop_all <-- fNloop(n, t, flags) */ - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - /* 6: Nloop_rw <-- 0 */ - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) { - Nloop_rw = Nloop_all; - } else { - /* 3: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - /* 4: Nloop_rw <-- Nloop_all / p */ - Nloop_rw = Nloop_all / p; - } - } - - /* 8: n <-- n - (n mod 2) */ - Nchunk &= ~(uint64_t)1; /* round down to even */ - /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ - Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ - - /* 11: for i = 0 to p - 1 do */ - /* 12: u <-- in */ - for (i = 0, Vchunk = 0; i < p; i++, Vchunk += Nchunk) { - /* 13: if i = p - 1 */ - /* 14: n <-- N - u */ - /* 15: end if */ - /* 16: v <-- u + n - 1 */ - uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - uint32_t * Bp = &B[i * s]; - uint32_t * Vp = &V[Vchunk * s]; - pwxform_ctx_t * ctx_i = NULL; - /* 17: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - ctx_i = &ctx[i]; - /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ - smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, - ctx_i->S, 0, NULL, XY, NULL); - /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ - ctx_i->S2 = (uint32_t (*)[2])ctx_i->S; - /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ - ctx_i->S1 = ctx_i->S2 + (1 << Swidth) * PWXsimple; - /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ - ctx_i->S0 = ctx_i->S1 + (1 << Swidth) * PWXsimple; - /* 22: w_i <-- 0 */ - ctx_i->w = 0; - /* 23: if i = 0 */ - if (i == 0) { - /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, Bp + (s - 16), 64); - HMAC_SHA256_Update_Y(&ctx, passwd, 32); - HMAC_SHA256_Final_Y(passwd, &ctx); - } - } - if (!(flags & __YESCRYPT_INIT_SHARED_2)) { - /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ - smix1(Bp, r, Np, flags, Vp, NROM, VROM, XY, ctx_i); - } - /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, VROM, XY, ctx_i); - } - - /* 30: for i = 0 to p - 1 do */ - for (i = 0; i < p; i++) { - uint32_t * Bp = &B[i * s]; - /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ - smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, - V, NROM, VROM, XY, (flags & YESCRYPT_RW) ? &ctx[i] : NULL); - } -} - -/** - * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing optimized implementations to - * preserve and reuse a memory allocation across calls, thereby reducing its - * overhead (this reference implementation does not make that optimization). - * - * Return 0 on success; or -1 on error. - */ -static int -yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - int retval = -1; - uint64_t NROM; - const uint32_t * VROM; - size_t B_size, V_size; - uint32_t * B, * V, * XY, * S; - pwxform_ctx_t * pwxform_ctx; - uint32_t sha256[8]; - uint8_t dk[sizeof(sha256)], * dkp = buf; - uint32_t i; - - /* Sanity-check parameters */ - if ((flags & ~YESCRYPT_KNOWN_FLAGS) || (!flags && t)) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 128 / p) || -#if SIZE_MAX / 256 <= UINT32_MAX - (r > SIZE_MAX / 256) || -#endif - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } - if (N > UINT64_MAX / ((uint64_t)t + 1)) { - errno = EFBIG; - return -1; - } - if (flags & YESCRYPT_RW) { - if ((flags & YESCRYPT_WORM) || (N / p <= 1) || (r < rmin)) { - errno = EINVAL; - return -1; - } - if (p > SIZE_MAX / Sbytes) { - errno = ENOMEM; - return -1; - } - if (p > SIZE_MAX / sizeof(*pwxform_ctx)) { - errno = ENOMEM; - return -1; - } - } - - NROM = 0; - VROM = NULL; - if (shared) { - NROM = shared->aligned_size / ((size_t)128 * r); -/* - * This implementation could support ROM without YESCRYPT_RW as well, but we - * currently don't want to make such support available so that it can be safely - * excluded from optimized implementations (where it'd require extra code). - */ - if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - VROM = shared->aligned; - } - - /* Allocate memory */ - V_size = (size_t)128 * r * N; - if (flags & __YESCRYPT_INIT_SHARED) { - V = (uint32_t *)local->aligned; - if (local->aligned_size < V_size) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if ((V = malloc(V_size)) == NULL) - return -1; - local->base = local->aligned = V; - local->base_size = local->aligned_size = V_size; - } - } else { - if ((V = malloc(V_size)) == NULL) - return -1; - } - B_size = (size_t)128 * r * p; - if ((B = malloc(B_size)) == NULL) - goto free_V; - if ((XY = malloc((size_t)256 * r)) == NULL) - goto free_B; - S = NULL; - pwxform_ctx = NULL; - if (flags & YESCRYPT_RW) { - if ((S = malloc((size_t)Sbytes * p)) == NULL) - goto free_XY; - if ((pwxform_ctx = malloc(sizeof(*pwxform_ctx) * p)) == NULL) - goto free_S; - } - - if (flags) { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, "yescrypt-prehash", - (flags & __YESCRYPT_PREHASH) ? 16 : 8); - HMAC_SHA256_Update_Y(&ctx, passwd, passwdlen); - HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); - passwd = (uint8_t *)sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, - (uint8_t *)B, B_size); - - if (flags) - blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); - - if (flags & YESCRYPT_RW) { - for (i = 0; i < p; i++) - pwxform_ctx[i].S = &S[i * Swords]; - smix(B, r, N, p, t, flags, V, NROM, VROM, XY, pwxform_ctx, - (uint8_t *)sha256); - } else { - /* 2: for i = 0 to p - 1 do */ - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ - smix(&B[(size_t)32 * r * i], r, N, 1, t, flags, V, - NROM, VROM, XY, NULL, NULL); - } - } - - dkp = buf; - if (flags && buflen < sizeof(dk)) { - PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, - dk, sizeof(dk)); - dkp = dk; - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if (flags && !(flags & __YESCRYPT_PREHASH)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, dkp, sizeof(dk)); - HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); - HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX_Y ctx; - size_t clen = buflen; - if (clen > sizeof(dk)) - clen = sizeof(dk); - SHA256_Init_Y(&ctx); - SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); - SHA256_Final_Y(dk, &ctx); - memcpy(buf, dk, clen); - } - } - - /* Success! */ - retval = 0; - - /* Free memory */ - free(pwxform_ctx); -free_S: - free(S); -free_XY: - free(XY); -free_B: - free(B); -free_V: - if (!(flags & __YESCRYPT_INIT_SHARED)) - free(V); - - return retval; -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, g, flags, buf, buflen): - * Compute scrypt or its revision as requested by the parameters. The inputs - * to this function are the same as those for yescrypt_kdf_body() above, with - * the addition of g, which controls hash upgrades (0 for no upgrades so far). - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, - yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - uint8_t dk[32]; - - if ((flags & YESCRYPT_RW) && - p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, - dk, sizeof(dk)); - if (retval) - return retval; - passwd = dk; - passwdlen = sizeof(dk); - } - - do { - uint8_t * dkp = g ? dk : buf; - size_t dklen = g ? sizeof(dk) : buflen; - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N, r, p, t, flags, dkp, dklen); - if (retval) - return retval; - - passwd = dkp; - passwdlen = dklen; - - N <<= 2; - if (!N) - return -1; - t >>= 1; - } while (g--); - - return 0; -} - -int -yescrypt_init_shared(yescrypt_shared_t * shared, - const uint8_t * param, size_t paramlen, - uint64_t N, uint32_t r, uint32_t p, - yescrypt_init_shared_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_shared_t half1, half2; - uint8_t salt[32]; - - if (flags & YESCRYPT_SHARED_PREALLOCATED) { - if (!shared->aligned || !shared->aligned_size) - return -1; - } else { - shared->base = shared->aligned = NULL; - shared->base_size = shared->aligned_size = 0; - } - if (!param && !paramlen && !N && !r && !p && !buf && !buflen) - return 0; - - if (yescrypt_kdf_body(NULL, shared, - param, paramlen, NULL, 0, N, r, p, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - half1 = half2 = *shared; - half1.aligned_size /= 2; -#ifdef _MSC_VER - (uint8_t*)half2.aligned += half1.aligned_size; -#else - half2.aligned += half1.aligned_size; -#endif - half2.aligned_size = half1.aligned_size; - N /= 2; - - if (p > 1 && yescrypt_kdf_body(&half1, &half2, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_2, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf_body(&half2, &half1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf_body(&half1, &half2, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | __YESCRYPT_INIT_SHARED_1, - buf, buflen)) - goto out; - - return 0; - -out: - if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) - free(shared->base); - return -1; -} - -int -yescrypt_free_shared(yescrypt_shared_t * shared) -{ - free(shared->base); - shared->base = shared->aligned = NULL; - shared->base_size = shared->aligned_size = 0; - return 0; -} - -int -yescrypt_init_local(yescrypt_local_t * local) -{ -/* The reference implementation doesn't use the local structure */ - local->base = local->aligned = NULL; - local->base_size = local->aligned_size = 0; - return 0; -} - -int -yescrypt_free_local(yescrypt_local_t * local) -{ -/* The reference implementation frees its memory in yescrypt_kdf() */ - return 0; -} diff --git a/src/crypto/randomx/defyx/yescrypt-simd.c b/src/crypto/randomx/defyx/yescrypt-simd.c deleted file mode 100644 index 6f3d5ad7..00000000 --- a/src/crypto/randomx/defyx/yescrypt-simd.c +++ /dev/null @@ -1,1368 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2015 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -/* - * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding - * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX - * and XOP are of further help either way. - */ - -#include -#ifdef __XOP__ -#include -#endif - -#include -#include -#include -#include - -#include "insecure_memzero.h" -#include "sha256.h" -#include "sysendian.h" - -#include "yescrypt.h" - -#include "yescrypt-platform.c" - -#if __STDC_VERSION__ >= 199901L -/* have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -#ifdef __GNUC__ -#define unlikely(exp) __builtin_expect(exp, 0) -#else -#define unlikely(exp) (exp) -#endif - -#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); - -#ifdef __XOP__ -#define ARX(out, in1, in2, s) \ - out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); -#else -#define ARX(out, in1, in2, s) \ - { \ - __m128i T = _mm_add_epi32(in1, in2); \ - out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ - out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ - } -#endif - -#define SALSA20_2ROUNDS \ - /* Operate on "columns" */ \ - ARX(X1, X0, X3, 7) \ - ARX(X2, X1, X0, 9) \ - ARX(X3, X2, X1, 13) \ - ARX(X0, X3, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x93); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ -\ - /* Operate on "rows" */ \ - ARX(X3, X0, X1, 7) \ - ARX(X2, X3, X0, 9) \ - ARX(X1, X2, X3, 13) \ - ARX(X0, X1, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); - -/** - * Apply the Salsa20/2 core to the block provided in (X0 ... X3). - */ -#define SALSA20_2(out) \ - { \ - __m128i Y0 = X0; \ - __m128i Y1 = X1; \ - __m128i Y2 = X2; \ - __m128i Y3 = X3; \ - SALSA20_2ROUNDS \ - (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ - (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ - (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ - (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ - } - -/** - * Apply the Salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). - */ -#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ - X0 = _mm_xor_si128(X0, Z0); \ - X1 = _mm_xor_si128(X1, Z1); \ - X2 = _mm_xor_si128(X2, Z2); \ - X3 = _mm_xor_si128(X3, Z3); \ - { \ - maybe_decl Y0 = X0; \ - maybe_decl Y1 = X1; \ - maybe_decl Y2 = X2; \ - maybe_decl Y3 = X3; \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ - (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ - (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ - (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ - } - -#define SALSA20_8_XOR_MEM(in, out) \ - SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) - -#define SALSA20_8_XOR_REG(out) \ - SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) - -typedef union { - uint32_t w[16]; - __m128i q[4]; -} salsa20_blk_t; - -/** - * blockmix_salsa8(Bin, Bout, r): - * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. - */ -static void -blockmix_salsa8(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout, size_t r) -{ - size_t i; - __m128i X0, X1, X2, X3; - - r--; - PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i * 2], _MM_HINT_T0) - PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) - } - PREFETCH(&Bin[r * 2], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - X0 = Bin[r * 2 + 1].q[0]; - X1 = Bin[r * 2 + 1].q[1]; - X2 = Bin[r * 2 + 1].q[2]; - X3 = Bin[r * 2 + 1].q[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i <= r; i++) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) - } -} - -/* - * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs - * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and - * destination registers, whereas the shifts would require an extra move - * instruction for our code when building without AVX. Unfortunately, PSHUFD - * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) - * and somewhat slower on some non-Intel CPUs (luckily not including AMD - * Bulldozer and Piledriver). - */ -#ifdef __AVX__ -#define HI32(X) \ - _mm_srli_si128((X), 4) -#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) -/* Intel's name, also supported by recent gcc */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64x(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -/* This is tunable */ -#define Swidth 8 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define PWXsimple 2 -#define PWXgather 4 - -/* Derived values. Not tunable except via Swidth above. */ -#define PWXbytes (PWXgather * PWXsimple * 8) -#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) -#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) -#define Smask2 (((uint64_t)Smask << 32) | Smask) - -#if !defined(__x86_64__) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_X_T __m128i -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ - s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#else -/* 64-bit, or 32-bit without SSE4.1 */ -#define PWXFORM_X_T uint64_t -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = EXTRACT64(X) & Smask2; \ - s0 = *(__m128i *)(S0 + (uint32_t)x); \ - s1 = *(__m128i *)(S1 + (x >> 32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#endif - -#define PWXFORM_WRITE \ - *(__m128i *)(S2 + w) = X0; \ - *(__m128i *)(S2 + w + 16) = X1; \ - *(__m128i *)(S2 + w + 32) = X2; \ - *(__m128i *)(S2 + w + 48) = X3; \ - w += 64; - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0, x0, s00, s01) \ - PWXFORM_SIMD(X1, x1, s10, s11) \ - PWXFORM_SIMD(X2, x2, s20, s21) \ - PWXFORM_SIMD(X3, x3, s30, s31) - -#define PWXFORM \ - { \ - PWXFORM_X_T x0, x1, x2, x3; \ - __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ - PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND \ - w &= Smask; \ - { \ - uint8_t * Stmp = S2; \ - S2 = S1; \ - S1 = S0; \ - S0 = Stmp; \ - } \ - } - -#define XOR4(in) \ - X0 = _mm_xor_si128(X0, (in)[0]); \ - X1 = _mm_xor_si128(X1, (in)[1]); \ - X2 = _mm_xor_si128(X2, (in)[2]); \ - X3 = _mm_xor_si128(X3, (in)[3]); - -#define OUT(out) \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; - -typedef struct { - uint8_t *S0, *S1, *S2; - size_t w; -} pwxform_ctx_t; - -#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. - */ -static void -blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, - size_t r, pwxform_ctx_t *restrict ctx) -{ - uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; - size_t w = ctx->w; - size_t i; - __m128i X0, X1, X2, X3; - - /* Convert 128-byte blocks to 64-byte blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r *= 2; - - r--; - PREFETCH(&Bin[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i], _MM_HINT_T0) - } - - /* 2: X <-- B'_{r_1 - 1} */ - X0 = Bin[r].q[0]; - X1 = Bin[r].q[1]; - X2 = Bin[r].q[2]; - X3 = Bin[r].q[3]; - - /* 3: for i = 0 to r_1 - 1 do */ - i = 0; - do { - /* 5: X <-- X \xor B'_i */ - XOR4(Bin[i].q) - /* 7: X <-- pwxform(X) */ - PWXFORM - - if (unlikely(i >= r)) - break; - - /* 8: B'_i <-- X */ - OUT(Bout[i].q) - - i++; - } while (1); - - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; - - /* 11: B_i <-- H(B_i) */ - SALSA20_2(Bout[i].q) -} - -#define XOR4_2(in1, in2) \ - X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ - X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ - X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ - X3 = _mm_xor_si128((in1)[3], (in2)[3]); - -static uint32_t -blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r) -{ - size_t i; - __m128i X0, X1, X2, X3; - - r--; - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i <= r; i++) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q) - SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) - } - - return _mm_cvtsi128_si32(X0); -} - -static uint32_t -blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM, pwxform_ctx_t *restrict ctx) -{ - uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; - size_t w = ctx->w; - size_t i; - __m128i X0, X1, X2, X3; - - /* Convert 128-byte blocks to 64-byte blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r *= 2; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r], _MM_HINT_NTA) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_NTA) - PREFETCH(&Bin1[i], _MM_HINT_T0) - } - } else { - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - } - } - - /* 2: X <-- B'_{r_1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* 3: for i = 0 to r_1 - 1 do */ - i = 0; - r--; - do { - /* 5: X <-- X \xor B'_i */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - /* 7: X <-- pwxform(X) */ - PWXFORM - /* 8: B'_i <-- X */ - OUT(Bout[i].q) - - /* 5: X <-- X \xor B'_i */ - XOR4(Bin1[i + 1].q) - XOR4(Bin2[i + 1].q) - /* 7: X <-- pwxform(X) */ - PWXFORM - - if (unlikely(i >= r)) - break; - - /* 8: B'_i <-- X */ - OUT(Bout[i + 1].q) - - i += 2; - } while (1); - i++; - - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; - - /* 11: B_i <-- H(B_i) */ - SALSA20_2(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef XOR4 -#define XOR4(in, out) \ - (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ - (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ - (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ - (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); - -#define XOR4_Y \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -static uint32_t -blockmix_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, pwxform_ctx_t *restrict ctx) -{ - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; - size_t w = ctx->w; - size_t i; - - /* Convert 128-byte blocks to 64-byte blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r *= 2; - - r--; - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - } - - /* 2: X <-- B'_{r_1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* 3: for i = 0 to r_1 - 1 do */ - i = 0; - r--; - do { - XOR4(Bin1[i].q, Bin2[i].q) - /* 5: X <-- X \xor B'_i */ - XOR4_Y - /* 7: X <-- pwxform(X) */ - PWXFORM - /* 8: B'_i <-- X */ - OUT(Bout[i].q) - - XOR4(Bin1[i + 1].q, Bin2[i + 1].q) - /* 5: X <-- X \xor B'_i */ - XOR4_Y - /* 7: X <-- pwxform(X) */ - PWXFORM - - if (unlikely(i >= r)) - break; - - /* 8: B'_i <-- X */ - OUT(Bout[i + 1].q) - - i += 2; - } while (1); - i++; - - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; - - /* 11: B_i <-- H(B_i) */ - SALSA20_2(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef ARX -#undef SALSA20_2ROUNDS -#undef SALSA20_2 -#undef SALSA20_8_XOR_ANY -#undef SALSA20_8_XOR_MEM -#undef SALSA20_8_XOR_REG -#undef PWXFORM_X_T -#undef PWXFORM_SIMD -#undef PWXFORM_ROUND -#undef PWXFORM -#undef OUT -#undef XOR4 -#undef XOR4_2 -#undef XOR4_Y - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t -integerify(const salsa20_blk_t * B, size_t r) -{ - return B[2 * r - 1].w[0]; -} - -/** - * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r bytes in length. The value N must be even and no - * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and - * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 - * bytes as well saves cache lines, but might result in cache bank conflicts). - */ -static void -smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, - salsa20_blk_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 2 * r; - salsa20_blk_t * X = V, * Y; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - if (VROM) { - uint32_t n; - salsa20_blk_t * V_n; - const salsa20_blk_t * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, ctx); - - X = &V[2 * s]; - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - j = blockmix_xor(Y, V_j, X, r, 1, ctx); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V_n[i * s]; - - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 1, ctx); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 1, ctx); - } else if (flags & YESCRYPT_RW) { - uint32_t n; - salsa20_blk_t * V_n, * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, ctx); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[2 * s]; - blockmix(Y, X, r, ctx); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - Y = &V_n[i * s]; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - j = blockmix_xor(X, V_j, Y, r, 0, ctx); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 0, ctx); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, ctx); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 0, ctx); - } else { - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < N - 1; i += 2) { - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix_salsa8(X, Y, r); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[(i + 1) * s]; - blockmix_salsa8(Y, X, r); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix_salsa8(X, Y, r); - - /* 4: X <-- H(X) */ - X = XY; - blockmix_salsa8(Y, X, r); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. The value Nloop must be even. The array V must be aligned - * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 - * bytes (aligning them to 64 bytes as well saves cache lines, but might result - * in cache bank conflicts). - */ -static void -smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, - yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, - const salsa20_blk_t * VROM, salsa20_blk_t * XY, pwxform_ctx_t * ctx) -{ - size_t s = 2 * r; - salsa20_blk_t * X = XY, * Y = &XY[s]; - uint64_t i; - uint32_t j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - i = Nloop / 2; - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - -/* - * Normally, VROM implies YESCRYPT_RW, but we check for these separately - * because our SMix resets YESCRYPT_RW for the smix2() calls operating on the - * entire V when p > 1. - */ - if (VROM && (flags & YESCRYPT_RW)) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - salsa20_blk_t * V_j = &V[j * s]; - const salsa20_blk_t * VROM_j; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(X, V_j, Y, r, ctx) & (NROM - 1); - VROM_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, VROM_j, X, r, 1, ctx) & (N - 1); - V_j = &V[j * s]; - } - } else if (VROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 1, ctx) & (N - 1); - V_j = &V[j * s]; - } - } else if (flags & YESCRYPT_RW) { - /* 6: for i = 0 to N - 1 do */ - do { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(X, V_j, Y, r, ctx) & (N - 1); - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(Y, V_j, X, r, ctx) & (N - 1); - } while (--i); - } else if (ctx) { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (N - 1); - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 0, ctx) & (N - 1); - } while (--i); - } else { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1); - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1); - } while (--i); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage XY - * must be 256r or 256rp bytes in length (the larger size is required with - * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and - * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well - * saves cache lines and helps avoid false sharing in OpenMP-enabled builds - * when p > 1, but it might also result in cache bank conflicts). - */ -static void -smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const salsa20_blk_t * VROM, - salsa20_blk_t * XY, uint8_t * S, uint8_t * passwd) -{ - size_t s = 2 * r; - uint32_t Nchunk; - uint64_t Nloop_all, Nloop_rw; - uint32_t i; - - /* 1: n <-- N / p */ - Nchunk = N / p; - - /* 2: Nloop_all <-- fNloop(n, t, flags) */ - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - /* 6: Nloop_rw <-- 0 */ - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) { - Nloop_rw = Nloop_all; - } else { - /* 3: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - /* 4: Nloop_rw <-- Nloop_all / p */ - Nloop_rw = Nloop_all / p; - } - } - - /* 8: n <-- n - (n mod 2) */ - Nchunk &= ~(uint32_t)1; /* round down to even */ - /* 9: Nloop_all <-- Nloop_all + (Nloop_all mod 2) */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - /* 10: Nloop_rw <-- Nloop_rw + (Nloop_rw mod 2) */ - Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ - - /* 11: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw) - { -#pragma omp for -#endif - for (i = 0; i < p; i++) { - /* 12: u <-- in */ - uint32_t Vchunk = i * Nchunk; - /* 13: if i = p - 1 */ - /* 14: n <-- N - u */ - /* 15: end if */ - /* 16: v <-- u + n - 1 */ - uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - uint8_t * Bp = &B[128 * r * i]; - salsa20_blk_t * Vp = &V[Vchunk * s]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - pwxform_ctx_t * ctx_i = NULL; - /* 17: if YESCRYPT_RW flag is set */ - if (flags & YESCRYPT_RW) { - uint8_t *Si = S + i * Salloc; - /* 18: SMix1_1(B_i, Sbytes / 128, S_i, no flags) */ - smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, - (salsa20_blk_t *)Si, 0, NULL, XYp, NULL); - ctx_i = (pwxform_ctx_t *)(Si + Sbytes); - /* 19: S2_i <-- S_{i,0...2^Swidth-1} */ - ctx_i->S2 = Si; - /* 20: S1_i <-- S_{i,2^Swidth...2*2^Swidth-1} */ - ctx_i->S1 = Si + Sbytes / 3; - /* 21: S0_i <-- S_{i,2*2^Swidth...3*2^Swidth-1} */ - ctx_i->S0 = Si + Sbytes / 3 * 2; - /* 22: w_i <-- 0 */ - ctx_i->w = 0; - /* 23: if i = 0 */ - if (i == 0) { - /* 24: passwd <-- HMAC-SHA256(B_{0,2r-1}, passwd) */ - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, Bp + (128 * r - 64), 64); - HMAC_SHA256_Update(&ctx, passwd, 32); - HMAC_SHA256_Final(passwd, &ctx); - } - } - if (!(flags & __YESCRYPT_INIT_SHARED_2)) { - /* 27: SMix1_r(B_i, n, V_{u..v}, flags) */ - smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); - } - /* 28: SMix2_r(B_i, p2floor(n), Nloop_rw, V_{u..v}, flags) */ - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, VROM, XYp, ctx_i); - } - - /* 30: for i = 0 to p - 1 do */ - if (Nloop_all > Nloop_rw) { -#ifdef _OPENMP -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint8_t * Bp = &B[128 * r * i]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - pwxform_ctx_t * ctx_i = NULL; - if (flags & YESCRYPT_RW) { - uint8_t *Si = S + i * Salloc; - ctx_i = (pwxform_ctx_t *)(Si + Sbytes); - } - /* 31: SMix2_r(B_i, N, Nloop_all - Nloop_rw, V, flags excluding YESCRYPT_RW) */ - smix2(Bp, r, N, Nloop_all - Nloop_rw, - flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i); - } - } -#ifdef _OPENMP - } -#endif -} - -/** - * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. (This optimized implementation currently additionally - * limits N to the range from 8 to 2^31, but other implementation might not.) - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -static int -yescrypt_kdf_body(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_region_t tmp; - uint64_t NROM; - const salsa20_blk_t * VROM; - size_t B_size, V_size, XY_size, need; - uint8_t * B, * S; - salsa20_blk_t * V, * XY; - uint8_t sha256[32]; - uint8_t dk[sizeof(sha256)], * dkp = buf; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (N > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 3) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 256 / p) || - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_RW) { - if (N / p <= 3) { - errno = EINVAL; - return -1; - } - if (p > SIZE_MAX / Salloc) { - errno = ENOMEM; - return -1; - } - } -#ifdef _OPENMP - else if (N > SIZE_MAX / 128 / (r * p)) { - errno = ENOMEM; - return -1; - } -#endif - - NROM = 0; - VROM = NULL; - if (shared) { - NROM = shared->aligned_size / ((size_t)128 * r); - if (NROM > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - VROM = shared->aligned; - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_RW)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (salsa20_blk_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_RW) { - size_t S_size = (size_t)Salloc * p; - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint8_t *)tmp.aligned; - XY = (salsa20_blk_t *)((uint8_t *)B + B_size); - } else { - init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_RW) - S = (uint8_t *)XY + XY_size; - - if (flags) { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, "yescrypt-prehash", - (flags & __YESCRYPT_PREHASH) ? 16 : 8); - HMAC_SHA256_Update(&ctx, passwd, passwdlen); - HMAC_SHA256_Final(sha256, &ctx); - passwd = sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); - - if (t || flags) - memcpy(sha256, B, sizeof(sha256)); - - if (p == 1 || (flags & YESCRYPT_RW)) { - smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, sha256); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, - &V[(size_t)2 * r * i * N], - NROM, VROM, - &XY[(size_t)4 * r * i], NULL, NULL); -#else - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, - NROM, VROM, XY, NULL, NULL); -#endif - } - } - - dkp = buf; - if (flags && buflen < sizeof(dk)) { - PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, dk, sizeof(dk)); - dkp = dk; - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if (flags && !(flags & __YESCRYPT_PREHASH)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, dkp, sizeof(dk)); - HMAC_SHA256_Update(&ctx, "Client Key", 10); - HMAC_SHA256_Final(sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX ctx; - size_t clen = buflen; - if (clen > sizeof(dk)) - clen = sizeof(dk); - SHA256_Init(&ctx); - SHA256_Update(&ctx, sha256, sizeof(sha256)); - SHA256_Final(dk, &ctx); - memcpy(buf, dk, clen); - } - } - - if (free_region(&tmp)) - return -1; - - /* Success! */ - return 0; -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, g, flags, buf, buflen): - * Compute scrypt or its revision as requested by the parameters. The inputs - * to this function are the same as those for yescrypt_kdf_body() above, with - * the addition of g, which controls hash upgrades (0 for no upgrades so far). - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, uint32_t g, - yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - uint8_t dk[32]; - - if ((flags & (YESCRYPT_RW | __YESCRYPT_INIT_SHARED)) == YESCRYPT_RW && - p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) { - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N >> 6, r, p, 0, flags | __YESCRYPT_PREHASH, - dk, sizeof(dk)); - if (retval) - return retval; - passwd = dk; - passwdlen = sizeof(dk); - } - - do { - uint8_t * dkp = g ? dk : buf; - size_t dklen = g ? sizeof(dk) : buflen; - int retval = yescrypt_kdf_body(shared, local, - passwd, passwdlen, salt, saltlen, - N, r, p, t, flags, dkp, dklen); - if (retval) - return retval; - - passwd = dkp; - passwdlen = dklen; - - N <<= 2; - if (!N) - return -1; - t >>= 1; - } while (g--); - - return 0; -} diff --git a/src/crypto/randomx/defyx/yescrypt.h b/src/crypto/randomx/defyx/yescrypt.h deleted file mode 100644 index 4af307e8..00000000 --- a/src/crypto/randomx/defyx/yescrypt.h +++ /dev/null @@ -1,326 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013-2015 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ -#ifndef _YESCRYPT_H_ -#define _YESCRYPT_H_ - -#include -#include /* for size_t */ - -/** - * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen) and write the result into buf. The parameters r, p, and buflen - * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N - * must be a power of 2 greater than 1. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, - uint8_t * __buf, size_t __buflen); - -/** - * Internal type used by the memory allocator. Please do not use it directly. - * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since - * they might differ from each other in a future version. - */ -typedef struct { - void * base, * aligned; - size_t base_size, aligned_size; -} yescrypt_region_t; - -/** - * Types for shared (ROM) and thread-local (RAM) data structures. - */ -typedef yescrypt_region_t yescrypt_shared_t; -typedef yescrypt_region_t yescrypt_local_t; - -/** - * Possible values for yescrypt_init_shared()'s flags argument. - */ -typedef enum { - YESCRYPT_SHARED_DEFAULTS = 0, - YESCRYPT_SHARED_PREALLOCATED = 0x100 -} yescrypt_init_shared_flags_t; - -/** - * Possible values for the flags argument of yescrypt_kdf(), - * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, - * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. - * Please refer to the description of yescrypt_kdf() below for the meaning of - * these flags. - */ -typedef enum { -/* public */ - YESCRYPT_WORM = 2, - YESCRYPT_RW = 1, -/* private */ - __YESCRYPT_INIT_SHARED_1 = 0x10000, - __YESCRYPT_INIT_SHARED_2 = 0x20000, - __YESCRYPT_INIT_SHARED = 0x30000, - __YESCRYPT_PREHASH = 0x100000 -} yescrypt_flags_t; - -#define YESCRYPT_KNOWN_FLAGS \ - (YESCRYPT_WORM | YESCRYPT_RW | \ - __YESCRYPT_INIT_SHARED | __YESCRYPT_PREHASH) - -/** - * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, buf, buflen): - * Optionally allocate memory for and initialize the shared (ROM) data - * structure. The parameters N, r, and p must satisfy the same conditions as - * with crypto_scrypt(). param and paramlen specify a local parameter with - * which the ROM is seeded. If buf is not NULL, then it is used to return - * buflen bytes of message digest for the initialized ROM (the caller may use - * this to verify that the ROM has been computed in the same way that it was on - * a previous run). - * - * Return 0 on success; or -1 on error. - * - * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the - * ROM is assumed to have been preallocated by the caller, with shared->aligned - * being the start address of the ROM and shared->aligned_size being its size - * (which must be consistent with N, r, and p). This may be used e.g. when the - * ROM is to be placed in a SysV shared memory segment allocated by the caller. - * - * MT-safe as long as shared is local to the thread. - */ -extern int yescrypt_init_shared(yescrypt_shared_t * __shared, - const uint8_t * __param, size_t __paramlen, - uint64_t __N, uint32_t __r, uint32_t __p, - yescrypt_init_shared_flags_t __flags, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_free_shared(shared): - * Free memory that had been allocated with yescrypt_init_shared(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as shared is local to the thread. - */ -extern int yescrypt_free_shared(yescrypt_shared_t * __shared); - -/** - * yescrypt_init_local(local): - * Initialize the thread-local (RAM) data structure. Actual memory allocation - * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_init_local(yescrypt_local_t * __local); - -/** - * yescrypt_free_local(local): - * Free memory that may have been allocated for an initialized thread-local - * (RAM) data structure. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_free_local(yescrypt_local_t * __local); - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, g, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters N, r, p, and buflen must satisfy - * the same conditions as with crypto_scrypt(). t controls computation time - * while not affecting peak memory usage. g controls hash upgrades (0 for no - * upgrades so far). shared and flags may request special modes as described - * below. local is the thread-local data structure, allowing to preserve and - * reuse a memory allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - * - * t controls computation time. t = 0 is optimal in terms of achieving the - * highest area-time for ASIC attackers. Thus, higher computation time, if - * affordable, is best achieved by increasing N rather than by increasing t. - * However, if the higher memory usage (which goes along with higher N) is not - * affordable, or if fine-tuning of the time is needed (recall that N must be a - * power of 2), then t = 1 or above may be used to increase time while staying - * at the same peak memory usage. t = 1 increases the time by 25% and - * decreases the normalized area-time to 96% of optimal. (Of course, in - * absolute terms the area-time increases with higher t. It's just that it - * would increase slightly more with higher N*r rather than with higher t.) - * t = 2 increases the time by another 20% and decreases the normalized - * area-time to 89% of optimal. Thus, these two values are reasonable to use - * for fine-tuning. Values of t higher than 2 result in further increase in - * time while reducing the efficiency much further (e.g., down to around 50% of - * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact - * numbers varying by the flags settings). - * - * Classic scrypt is available by setting t = 0, flags = 0, and shared = NULL. - * In this mode, the thread-local memory region (RAM) is first sequentially - * written to and then randomly read from. This algorithm is friendly towards - * time-memory tradeoffs (TMTO), available both to defenders (albeit not in - * this implementation) and to attackers. - * - * Setting YESCRYPT_WORM enables only minimal enhancements relative to classic - * scrypt: support for the t parameter, and pre- and post-hashing. - * - * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local - * memory region (RAM), which makes TMTO a lot less efficient. This may be - * used to slow down the kinds of attackers who would otherwise benefit from - * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not - * only for the tradeoff, but also for a decrease of attacker's area-time (by - * up to a constant factor), setting YESCRYPT_RW substantially increases the - * cost of attacks in area-time terms as well. Yet another benefit of it is - * that optimal area-time is reached at an earlier time than with classic - * scrypt, and t = 0 actually corresponds to this earlier completion time, - * resulting in quicker hash computations (and thus in higher request rate - * capacity). Due to these properties, YESCRYPT_RW should almost always be - * set, except when compatibility with classic scrypt or TMTO-friendliness are - * desired. - * - * YESCRYPT_RW also moves parallelism that is present with p > 1 to a - * lower level as compared to where it is in classic scrypt. This reduces - * flexibility for efficient computation (for both attackers and defenders) by - * requiring that, short of resorting to TMTO, the full amount of memory be - * allocated as needed for the specified p, regardless of whether that - * parallelism is actually being fully made use of or not. (For comparison, a - * single instance of classic scrypt may be computed in less memory without any - * CPU time overhead, but in more real time, by not making full use of the - * parallelism.) This may be desirable when the defender has enough memory - * with sufficiently low latency and high bandwidth for efficient full parallel - * execution, yet the required memory size is high enough that some likely - * attackers might end up being forced to choose between using higher latency - * memory than they could use otherwise (waiting for data longer) or using TMTO - * (waiting for data more times per one hash computation). The area-time cost - * for other kinds of attackers (who would use the same memory type and TMTO - * factor or no TMTO either way) remains roughly the same, given the same - * running time for the defender. - * - * As a side effect of differences between the algorithms, setting YESCRYPT_RW - * also changes the way the total processing time (combined for all threads) - * and memory allocation (if the parallelism is being made use of) is to be - * controlled from N*r*p (for classic scrypt) to N*r (in this modification). - * Obviously, these only differ for p > 1. - * - * Passing a shared structure, with ROM contents previously computed by - * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for - * the thread-local RAM region. In order to allow for initialization of the - * ROM to be split into a separate program, the shared->aligned and - * shared->aligned_size fields may be set by the caller of yescrypt_kdf() - * manually rather than with yescrypt_init_shared(). - * - * local must be initialized with yescrypt_init_local(). - * - * MT-safe as long as local and buf are local to the thread. - */ -extern int yescrypt_kdf(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, uint32_t __g, - yescrypt_flags_t __flags, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. If shared is not NULL, a ROM - * is used and YESCRYPT_RW is required. Otherwise, whether to compute classic - * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or - * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined - * by the setting string. shared (if not NULL) and local must be initialized - * as described above for yescrypt_kdf(). buf must be large enough (as - * indicated by buflen) to hold the encoded hash string. - * - * Return the encoded hash string on success; or NULL on error. - * - * MT-safe as long as local and buf are local to the thread. - */ -extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __setting, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt(passwd, setting): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. Whether to compute classic - * scrypt, YESCRYPT_WORM (a slight deviation from classic scrypt), or - * YESCRYPT_RW (time-memory tradeoff discouraging modification) is determined - * by the setting string. - * - * Return the encoded hash string on success; or NULL on error. - * - * This is a crypt(3)-like interface, which is simpler to use than - * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, - * and it is slower than yescrypt_r() for repeated calls because it allocates - * and frees memory on each call. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); - -/** - * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): - * Generate a setting string for use with yescrypt_r() and yescrypt() by - * encoding into it the parameters N_log2 (which is to be set to base 2 - * logarithm of the desired value for N), r, p, flags, and a salt given by src - * (of srclen bytes). buf must be large enough (as indicated by buflen) to - * hold the setting string. - * - * Return the setting string on success; or NULL on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern uint8_t * yescrypt_gensalt_r( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): - * Generate a setting string for use with yescrypt_r() and yescrypt(). This - * function is the same as yescrypt_gensalt_r() except that it uses a static - * buffer and thus is not MT-safe. - * - * Return the setting string on success; or NULL on error. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt_gensalt( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen); - -#endif /* !_YESCRYPT_H_ */ diff --git a/src/crypto/randomx/defyx/KangarooTwelve.c b/src/crypto/randomx/panthera/KangarooTwelve.c similarity index 100% rename from src/crypto/randomx/defyx/KangarooTwelve.c rename to src/crypto/randomx/panthera/KangarooTwelve.c diff --git a/src/crypto/randomx/defyx/KangarooTwelve.h b/src/crypto/randomx/panthera/KangarooTwelve.h similarity index 100% rename from src/crypto/randomx/defyx/KangarooTwelve.h rename to src/crypto/randomx/panthera/KangarooTwelve.h diff --git a/src/crypto/randomx/defyx/KeccakP-1600-SnP.h b/src/crypto/randomx/panthera/KeccakP-1600-SnP.h similarity index 100% rename from src/crypto/randomx/defyx/KeccakP-1600-SnP.h rename to src/crypto/randomx/panthera/KeccakP-1600-SnP.h diff --git a/src/crypto/randomx/defyx/KeccakP-1600-reference.c b/src/crypto/randomx/panthera/KeccakP-1600-reference.c similarity index 100% rename from src/crypto/randomx/defyx/KeccakP-1600-reference.c rename to src/crypto/randomx/panthera/KeccakP-1600-reference.c diff --git a/src/crypto/randomx/defyx/KeccakSponge-common.h b/src/crypto/randomx/panthera/KeccakSponge-common.h similarity index 100% rename from src/crypto/randomx/defyx/KeccakSponge-common.h rename to src/crypto/randomx/panthera/KeccakSponge-common.h diff --git a/src/crypto/randomx/defyx/KeccakSponge.inc b/src/crypto/randomx/panthera/KeccakSponge.inc similarity index 100% rename from src/crypto/randomx/defyx/KeccakSponge.inc rename to src/crypto/randomx/panthera/KeccakSponge.inc diff --git a/src/crypto/randomx/defyx/KeccakSpongeWidth1600.c b/src/crypto/randomx/panthera/KeccakSpongeWidth1600.c similarity index 100% rename from src/crypto/randomx/defyx/KeccakSpongeWidth1600.c rename to src/crypto/randomx/panthera/KeccakSpongeWidth1600.c diff --git a/src/crypto/randomx/defyx/KeccakSpongeWidth1600.h b/src/crypto/randomx/panthera/KeccakSpongeWidth1600.h similarity index 100% rename from src/crypto/randomx/defyx/KeccakSpongeWidth1600.h rename to src/crypto/randomx/panthera/KeccakSpongeWidth1600.h diff --git a/src/crypto/randomx/defyx/Phases.h b/src/crypto/randomx/panthera/Phases.h similarity index 100% rename from src/crypto/randomx/defyx/Phases.h rename to src/crypto/randomx/panthera/Phases.h diff --git a/src/crypto/randomx/defyx/align.h b/src/crypto/randomx/panthera/align.h similarity index 100% rename from src/crypto/randomx/defyx/align.h rename to src/crypto/randomx/panthera/align.h diff --git a/src/crypto/randomx/defyx/brg_endian.h b/src/crypto/randomx/panthera/brg_endian.h similarity index 100% rename from src/crypto/randomx/defyx/brg_endian.h rename to src/crypto/randomx/panthera/brg_endian.h diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index ebdeb8ec..670e2de3 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -48,9 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include extern "C" { -#include "crypto/randomx/defyx/yescrypt.h" #include "crypto/randomx/panthera/yespower.h" -#include "crypto/randomx/defyx/KangarooTwelve.h" +#include "crypto/randomx/panthera/KangarooTwelve.h" } #include "crypto/rx/Profiler.h" @@ -122,15 +121,11 @@ RandomX_ConfigurationScala::RandomX_ConfigurationScala() RANDOMX_FREQ_CBRANCH = 16; } -RandomX_ConfigurationScala2::RandomX_ConfigurationScala2() -{ -} - RandomX_ConfigurationBase::RandomX_ConfigurationBase() : ArgonMemory(262144) - , CacheAccesses(8) - , DatasetBaseSize(2147483648) - , ArgonIterations(3) + , CacheAccesses(8) + , DatasetBaseSize(2147483648) + , ArgonIterations(3) , ArgonLanes(1) , ArgonSalt("RandomX\x03") , ScratchpadL1_Size(16384) @@ -391,31 +386,11 @@ RandomX_ConfigurationArqma RandomX_ArqmaConfig; RandomX_ConfigurationSafex RandomX_SafexConfig; RandomX_ConfigurationKeva RandomX_KevaConfig; RandomX_ConfigurationScala RandomX_ScalaConfig; -RandomX_ConfigurationScala2 RandomX_Scala2Config; alignas(64) RandomX_ConfigurationBase RandomX_CurrentConfig; static std::mutex vm_pool_mutex; -int rx_sipesh_k12(void *out, size_t outlen, const void *in, size_t inlen) -{ - const void *salt = in; - size_t saltlen = inlen; - yescrypt_local_t local; - int retval; - - if (yescrypt_init_local(&local)) return -1; - retval = yescrypt_kdf(NULL, &local, - (const uint8_t*)in, inlen, - (const uint8_t*)salt, saltlen, - (uint64_t)2048, 8, 1, 0, 0, (yescrypt_flags_t)1, - (uint8_t*)out, outlen - ); - if (yescrypt_free_local(&local) || retval) return -1; - retval = KangarooTwelve((const unsigned char *)in, inlen, (unsigned char *)out, 32, 0, 0); - return retval; -} - int rx_yespower_k12(void *out, size_t outlen, const void *in, size_t inlen) { rx_blake2b_wrapper::run(out, outlen, in, inlen); diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 8446b098..99d5877c 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -154,7 +154,6 @@ struct RandomX_ConfigurationArqma : public RandomX_ConfigurationBase { RandomX_C struct RandomX_ConfigurationSafex : public RandomX_ConfigurationBase { RandomX_ConfigurationSafex(); }; struct RandomX_ConfigurationKeva : public RandomX_ConfigurationBase { RandomX_ConfigurationKeva(); }; struct RandomX_ConfigurationScala : public RandomX_ConfigurationBase { RandomX_ConfigurationScala(); }; -struct RandomX_ConfigurationScala2 : public RandomX_ConfigurationScala { RandomX_ConfigurationScala2(); }; extern RandomX_ConfigurationMonero RandomX_MoneroConfig; extern RandomX_ConfigurationWownero RandomX_WowneroConfig; @@ -162,7 +161,6 @@ extern RandomX_ConfigurationArqma RandomX_ArqmaConfig; extern RandomX_ConfigurationSafex RandomX_SafexConfig; extern RandomX_ConfigurationKeva RandomX_KevaConfig; extern RandomX_ConfigurationScala RandomX_ScalaConfig; -extern RandomX_ConfigurationScala2 RandomX_Scala2Config; extern RandomX_ConfigurationBase RandomX_CurrentConfig; diff --git a/src/crypto/rx/RxAlgo.cpp b/src/crypto/rx/RxAlgo.cpp index 0df335b4..3c265c6d 100644 --- a/src/crypto/rx/RxAlgo.cpp +++ b/src/crypto/rx/RxAlgo.cpp @@ -46,7 +46,7 @@ const RandomX_ConfigurationBase *xmrig::RxAlgo::base(Algorithm::Id algorithm) return &RandomX_KevaConfig; case Algorithm::RX_XLA: - return &RandomX_Scala2Config; + return &RandomX_ScalaConfig; default: break;