Merge branch 'dev' into sync-base

XMRig 2022-06-14 05:19:28 +07:00
commit 3ded8e6734
No known key found for this signature in database
GPG key ID: 446A53638BE94409
119 changed files with 5867 additions and 14547 deletions


@@ -1,139 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "crypto/astrobwt/AstroBWT.h"
#include "backend/cpu/Cpu.h"
#include "base/crypto/sha3.h"
#include "base/tools/bswap_64.h"
#include "crypto/cn/CryptoNight.h"
#include "crypto/astrobwt/sort_indices2.h"
#include <limits>
static bool astrobwtInitialized = false;
#ifdef ASTROBWT_AVX2
static bool hasAVX2 = false;
extern "C"
#ifndef _MSC_VER
__attribute__((ms_abi))
#endif
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
#endif
#ifdef XMRIG_ARM
extern "C" {
#include "salsa20_ref/ecrypt-sync.h"
}
static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
uint8_t iv[8] = {};
ECRYPT_ctx ctx;
ECRYPT_keysetup(&ctx, static_cast<const uint8_t*>(key), 256, 64);
ECRYPT_ivsetup(&ctx, iv);
ECRYPT_keystream_bytes(&ctx, static_cast<uint8_t*>(output), size);
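// Zero 16-byte guard regions just before and after the keystream -- presumably
// padding for the BWT stage, which appears to touch both ends (an assumption;
// the deleted code does not document it).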
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#else
#include "Salsa20.hpp"
static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
const uint64_t iv = 0;
ZeroTier::Salsa20 s(key, &iv);
s.XORKeyStream(output, static_cast<uint32_t>(size));
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
extern "C" int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key);
static void Salsa20_XORKeyStream_AVX256(const void* key, void* output, size_t size)
{
const uint64_t iv = 0;
salsa20_stream_avx2(output, size, &iv, key);
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#endif
bool xmrig::astrobwt::astrobwt_dero_v2(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash)
{
constexpr size_t N = 9973;
constexpr size_t STRIDE = 10240;
alignas(8) uint8_t key[32];
uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
uint8_t* v = scratchpad_ptr;
uint32_t* indices = (uint32_t*)(scratchpad_ptr + STRIDE);
uint32_t* tmp_indices = (uint32_t*)(scratchpad_ptr + STRIDE * 5);
#ifdef ASTROBWT_AVX2
if (hasAVX2) {
SHA3_256_AVX2_ASM(input_data, input_size, key);
Salsa20_XORKeyStream_AVX256(key, v, N);
}
else
#endif
{
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
Salsa20_XORKeyStream(key, v, N);
}
sort_indices_astrobwt_v2(N, v, indices, tmp_indices);
#ifdef ASTROBWT_AVX2
if (hasAVX2) {
SHA3_256_AVX2_ASM(indices, N * 2, output_hash);
}
else
#endif
{
sha3_HashBuffer(256, SHA3_FLAGS_NONE, indices, N * 2, output_hash, 32);
}
return true;
}
void xmrig::astrobwt::init()
{
if (!astrobwtInitialized) {
# ifdef ASTROBWT_AVX2
hasAVX2 = Cpu::info()->hasAVX2();
# endif
astrobwtInitialized = true;
}
}
template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO_2>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
astrobwt_dero_v2(input, static_cast<uint32_t>(size), ctx[0]->memory, output);
}
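
For orientation, a minimal sketch of how this removed entry point was driven; the buffer sizing is a hypothetical illustration (real callers pass a CryptoNight scratchpad via ctx[0]->memory, as in single_hash above):

/* Illustrative only -- not part of the deleted file. The sizing assumes the
 * layout above: 64-byte prefix, STRIDE-byte stream region and two index
 * arrays, i.e. 64 + 10240 * 9 bytes in total. */
#include <cstdint>
#include <vector>
#include "crypto/astrobwt/AstroBWT.h"

void example_hash(const uint8_t* blob, uint32_t blob_size, uint8_t (&hash)[32])
{
    xmrig::astrobwt::init();                          // one-time AVX2 detection
    std::vector<uint8_t> scratchpad(64 + 10240 * 9);  // hypothetical sizing
    xmrig::astrobwt::astrobwt_dero_v2(blob, blob_size, scratchpad.data(), hash);
}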


@@ -1,43 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "base/crypto/Algorithm.h"
struct cryptonight_ctx;
namespace xmrig {
namespace astrobwt {
bool astrobwt_dero_v2(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash);
void init();
template<Algorithm::Id ALGO>
void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
template<>
void single_hash<Algorithm::ASTROBWT_DERO_2>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
}} // namespace xmrig::astrobwt


@@ -1,352 +0,0 @@
/*
* Based on public domain code available at: http://cr.yp.to/snuffle.html
*
* Modifications and C-native SSE macro based SSE implementation by
* Adam Ierymenko <adam.ierymenko@zerotier.com>.
*
* Additional modifications and code cleanup for AstroBWT by
* SChernykh <https://github.com/SChernykh>
*
* Since the original was public domain, this is too.
*/
#include "Salsa20.hpp"
// Statically compute and define SSE constants
class _s20sseconsts
{
public:
_s20sseconsts()
{
maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
maskHi32 = _mm_slli_epi64(maskLo32, 32);
}
__m128i maskLo32,maskHi32;
};
static const _s20sseconsts _S20SSECONSTANTS;
namespace ZeroTier {
void Salsa20::init(const void *key,const void *iv)
{
const uint32_t *const k = (const uint32_t *)key;
_state.i[0] = 0x61707865;
_state.i[1] = 0x3320646e;
_state.i[2] = 0x79622d32;
_state.i[3] = 0x6b206574;
_state.i[4] = k[3];
_state.i[5] = 0;
_state.i[6] = k[7];
_state.i[7] = k[2];
_state.i[8] = 0;
_state.i[9] = k[6];
_state.i[10] = k[1];
_state.i[11] = ((const uint32_t *)iv)[1];
_state.i[12] = k[5];
_state.i[13] = k[0];
_state.i[14] = ((const uint32_t *)iv)[0];
_state.i[15] = k[4];
}
void Salsa20::XORKeyStream(void *out,unsigned int bytes)
{
uint8_t tmp[64];
uint8_t *c = (uint8_t *)out;
uint8_t *ctarget = c;
unsigned int i;
if (!bytes)
return;
for (;;) {
if (bytes < 64) {
for (i = 0;i < bytes;++i)
tmp[i] = 0;
ctarget = c;
c = tmp;
}
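// Ten "2X round" blocks follow: the full 20 rounds of Salsa20/20, unrolled.
// Unlike canonical Salsa20, this trimmed copy stores the raw keystream to the
// output (see the _mm_storeu_ps calls at the end of the loop) instead of
// XORing it with the buffer's prior contents.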
__m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
__m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
__m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
__m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
__m128i T;
__m128i X0s = X0;
__m128i X1s = X1;
__m128i X2s = X2;
__m128i X3s = X3;
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
X0 = _mm_add_epi32(X0s,X0);
X1 = _mm_add_epi32(X1s,X1);
X2 = _mm_add_epi32(X2s,X2);
X3 = _mm_add_epi32(X3s,X3);
__m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
__m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
__m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
__m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
_mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_unpackhi_epi64(k02,k20)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_unpackhi_epi64(k13,k31)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_unpacklo_epi64(k20,k02)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_unpacklo_epi64(k31,k13)));
if (!(++_state.i[8])) {
++_state.i[5]; // state reordered for SSE
/* stopping at 2^70 bytes per nonce is user's responsibility */
}
if (bytes <= 64) {
if (bytes < 64) {
for (i = 0;i < bytes;++i)
ctarget[i] = c[i];
}
return;
}
bytes -= 64;
c += 64;
}
}
} // namespace ZeroTier


@@ -1,52 +0,0 @@
/*
* Based on public domain code available at: http://cr.yp.to/snuffle.html
*
* This therefore is public domain.
*/
#ifndef ZT_SALSA20_HPP
#define ZT_SALSA20_HPP
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
namespace ZeroTier {
/**
* Salsa20 stream cipher
*/
class Salsa20
{
public:
/**
* @param key 256-bit (32 byte) key
* @param iv 64-bit initialization vector
*/
Salsa20(const void *key,const void *iv)
{
init(key,iv);
}
/**
* Initialize cipher
*
* @param key Key bits
* @param iv 64-bit initialization vector
*/
void init(const void *key,const void *iv);
void XORKeyStream(void *out,unsigned int bytes);
private:
union {
__m128i v[4];
uint32_t i[16];
} _state;
};
} // namespace ZeroTier
#endif
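
A minimal usage sketch against this header (illustrative, not part of the commit). Note that in this AstroBWT-trimmed copy XORKeyStream() stores the raw keystream into the output buffer rather than XORing it with existing contents, so no zero-fill is needed first:

#include <cstdint>
#include "Salsa20.hpp"

// Illustrative only: 64 bytes of Salsa20/20 keystream from a 256-bit key and
// the all-zero 64-bit IV that AstroBWT.cpp always uses.
void example_keystream(const uint8_t (&key)[32], uint8_t (&stream)[64])
{
    const uint64_t iv = 0;
    ZeroTier::Salsa20 cipher(key, &iv);
    cipher.XORKeyStream(stream, sizeof(stream));
}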


@@ -1,272 +0,0 @@
/* ecrypt-config.h */
/* *** Normally, it should not be necessary to edit this file. *** */
#ifndef ECRYPT_CONFIG
#define ECRYPT_CONFIG
/* ------------------------------------------------------------------------- */
/* Guess the endianness of the target architecture. */
/*
* The LITTLE endian machines:
*/
#if defined(__ultrix) /* Older MIPS */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__alpha) /* Alpha */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(i386) /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__i386) /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_M_IX86) /* x86 (MSC, Borland) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_MSC_VER) /* x86 (surely MSC) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__INTEL_COMPILER) /* x86 (surely Intel compiler icl.exe) */
#define ECRYPT_LITTLE_ENDIAN
/*
* The BIG endian machines:
*/
#elif defined(sun) /* Newer Sparc's */
#define ECRYPT_BIG_ENDIAN
#elif defined(__ppc__) /* PowerPC */
#define ECRYPT_BIG_ENDIAN
/*
* Finally machines with UNKNOWN endianness:
*/
#elif defined (_AIX) /* RS6000 */
#define ECRYPT_UNKNOWN
#elif defined(__hpux) /* HP-PA */
#define ECRYPT_UNKNOWN
#elif defined(__aux) /* 68K */
#define ECRYPT_UNKNOWN
#elif defined(__dgux) /* 88K (but P6 in latest boxes) */
#define ECRYPT_UNKNOWN
#elif defined(__sgi) /* Newer MIPS */
#define ECRYPT_UNKNOWN
#else /* Any other processor */
#define ECRYPT_UNKNOWN
#endif
/* ------------------------------------------------------------------------- */
/*
* Find minimal-width types to store 8-bit, 16-bit, 32-bit, and 64-bit
* integers.
*
* Note: to enable 64-bit types on 32-bit compilers, it might be
* necessary to switch from ISO C90 mode to ISO C99 mode (e.g., gcc
* -std=c99).
*/
#include <limits.h>
/* --- check char --- */
#if (UCHAR_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T char
#define U8C(v) (v##U)
#if (UCHAR_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (UCHAR_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T char
#define U16C(v) (v##U)
#endif
#if (UCHAR_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T char
#define U32C(v) (v##U)
#endif
#if (UCHAR_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T char
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check short --- */
#if (USHRT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T short
#define U8C(v) (v##U)
#if (USHRT_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (USHRT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T short
#define U16C(v) (v##U)
#endif
#if (USHRT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T short
#define U32C(v) (v##U)
#endif
#if (USHRT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T short
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check int --- */
#if (UINT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T int
#define U8C(v) (v##U)
#if (UINT_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (UINT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T int
#define U16C(v) (v##U)
#endif
#if (UINT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T int
#define U32C(v) (v##U)
#endif
#if (UINT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T int
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check long --- */
#if (ULONG_MAX / 0xFUL > 0xFUL)
#ifndef I8T
#define I8T long
#define U8C(v) (v##UL)
#if (ULONG_MAX == 0xFFUL)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (ULONG_MAX / 0xFFUL > 0xFFUL)
#ifndef I16T
#define I16T long
#define U16C(v) (v##UL)
#endif
#if (ULONG_MAX / 0xFFFFUL > 0xFFFFUL)
#ifndef I32T
#define I32T long
#define U32C(v) (v##UL)
#endif
#if (ULONG_MAX / 0xFFFFFFFFUL > 0xFFFFFFFFUL)
#ifndef I64T
#define I64T long
#define U64C(v) (v##UL)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check long long --- */
#ifdef ULLONG_MAX
#if (ULLONG_MAX / 0xFULL > 0xFULL)
#ifndef I8T
#define I8T long long
#define U8C(v) (v##ULL)
#if (ULLONG_MAX == 0xFFULL)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (ULLONG_MAX / 0xFFULL > 0xFFULL)
#ifndef I16T
#define I16T long long
#define U16C(v) (v##ULL)
#endif
#if (ULLONG_MAX / 0xFFFFULL > 0xFFFFULL)
#ifndef I32T
#define I32T long long
#define U32C(v) (v##ULL)
#endif
#if (ULLONG_MAX / 0xFFFFFFFFULL > 0xFFFFFFFFULL)
#ifndef I64T
#define I64T long long
#define U64C(v) (v##ULL)
#endif
#endif
#endif
#endif
#endif
#endif
/* --- check __int64 --- */
#ifdef _UI64_MAX
#if (_UI64_MAX / 0xFFFFFFFFui64 > 0xFFFFFFFFui64)
#ifndef I64T
#define I64T __int64
#define U64C(v) (v##ui64)
#endif
#endif
#endif
/* ------------------------------------------------------------------------- */
#endif
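
For a sense of what all the probing above selects in practice (an illustration assuming a typical LP64 gcc target, not something the header itself guarantees):

/* Illustrative only: on an LP64 gcc target the checks above resolve to
 *   I8T  = char   (UCHAR_MAX == 0xFF, so ECRYPT_I8T_IS_BYTE is defined)
 *   I16T = short
 *   I32T = int
 *   I64T = long   (and ECRYPT_NATIVE64 is defined)
 */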


@@ -1,46 +0,0 @@
/* ecrypt-machine.h */
/*
* This file is included by 'ecrypt-portable.h'. It lets you override
* the default macros for specific platforms. Please carefully check
* the machine code generated by your compiler (with optimisations
* turned on) before deciding to edit this file.
*/
/* ------------------------------------------------------------------------- */
#if (defined(ECRYPT_DEFAULT_ROT) && !defined(ECRYPT_MACHINE_ROT))
#define ECRYPT_MACHINE_ROT
#if (defined(WIN32) && defined(_MSC_VER))
#undef ROTL32
#undef ROTR32
#undef ROTL64
#undef ROTR64
#include <stdlib.h>
#define ROTL32(v, n) _lrotl(v, n)
#define ROTR32(v, n) _lrotr(v, n)
#define ROTL64(v, n) _rotl64(v, n)
#define ROTR64(v, n) _rotr64(v, n)
#endif
#endif
/* ------------------------------------------------------------------------- */
#if (defined(ECRYPT_DEFAULT_SWAP) && !defined(ECRYPT_MACHINE_SWAP))
#define ECRYPT_MACHINE_SWAP
/*
* If you want to override the default swap macros, put them here.
*/
#endif
/* ------------------------------------------------------------------------- */


@@ -1,303 +0,0 @@
/* ecrypt-portable.h */
/*
* WARNING: the conversions defined below are implemented as macros,
* and should be used carefully. They should NOT be used with
* parameters which perform some action. E.g., the following two lines
* are not equivalent:
*
* 1) ++x; y = ROTL32(x, n);
* 2) y = ROTL32(++x, n);
*/
/*
* *** Please do not edit this file. ***
*
* The default macros can be overridden for specific architectures by
* editing 'ecrypt-machine.h'.
*/
#ifndef ECRYPT_PORTABLE
#define ECRYPT_PORTABLE
#include "ecrypt-config.h"
/* ------------------------------------------------------------------------- */
/*
* The following types are defined (if available):
*
* u8: unsigned integer type, at least 8 bits
* u16: unsigned integer type, at least 16 bits
* u32: unsigned integer type, at least 32 bits
* u64: unsigned integer type, at least 64 bits
*
* s8, s16, s32, s64 -> signed counterparts of u8, u16, u32, u64
*
* The selection of minimum-width integer types is taken care of by
* 'ecrypt-config.h'. Note: to enable 64-bit types on 32-bit
* compilers, it might be necessary to switch from ISO C90 mode to ISO
* C99 mode (e.g., gcc -std=c99).
*/
#ifdef I8T
typedef signed I8T s8;
typedef unsigned I8T u8;
#endif
#ifdef I16T
typedef signed I16T s16;
typedef unsigned I16T u16;
#endif
#ifdef I32T
typedef signed I32T s32;
typedef unsigned I32T u32;
#endif
#ifdef I64T
typedef signed I64T s64;
typedef unsigned I64T u64;
#endif
/*
* The following macros are used to obtain exact-width results.
*/
#define U8V(v) ((u8)(v) & U8C(0xFF))
#define U16V(v) ((u16)(v) & U16C(0xFFFF))
#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))
#define U64V(v) ((u64)(v) & U64C(0xFFFFFFFFFFFFFFFF))
/* ------------------------------------------------------------------------- */
/*
* The following macros return words with their bits rotated over n
* positions to the left/right.
*/
#define ECRYPT_DEFAULT_ROT
#define ROTL8(v, n) \
(U8V((v) << (n)) | ((v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((v) << (n)) | ((v) >> (16 - (n))))
#define ROTL32(v, n) \
(U32V((v) << (n)) | ((v) >> (32 - (n))))
#define ROTL64(v, n) \
(U64V((v) << (n)) | ((v) >> (64 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
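/*
 * Worked example (illustrative, not part of the original header):
 * ROTL32(0x80000001, 1) == U32V(0x80000001 << 1) | (0x80000001 >> 31)
 *                       == 0x00000002 | 0x00000001 == 0x00000003,
 * i.e. the bit shifted out at the top wraps around to bit 0.
 */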
#include "ecrypt-machine.h"
/* ------------------------------------------------------------------------- */
/*
* The following macros return a word with bytes in reverse order.
*/
#define ECRYPT_DEFAULT_SWAP
#define SWAP16(v) \
ROTL16(v, 8)
#define SWAP32(v) \
((ROTL32(v, 8) & U32C(0x00FF00FF)) | \
(ROTL32(v, 24) & U32C(0xFF00FF00)))
#ifdef ECRYPT_NATIVE64
#define SWAP64(v) \
((ROTL64(v, 8) & U64C(0x000000FF000000FF)) | \
(ROTL64(v, 24) & U64C(0x0000FF000000FF00)) | \
(ROTL64(v, 40) & U64C(0x00FF000000FF0000)) | \
(ROTL64(v, 56) & U64C(0xFF000000FF000000)))
#else
#define SWAP64(v) \
(((u64)SWAP32(U32V(v)) << 32) | (u64)SWAP32(U32V(v >> 32)))
#endif
#include "ecrypt-machine.h"
#define ECRYPT_DEFAULT_WTOW
#ifdef ECRYPT_LITTLE_ENDIAN
#define U16TO16_LITTLE(v) (v)
#define U32TO32_LITTLE(v) (v)
#define U64TO64_LITTLE(v) (v)
#define U16TO16_BIG(v) SWAP16(v)
#define U32TO32_BIG(v) SWAP32(v)
#define U64TO64_BIG(v) SWAP64(v)
#endif
#ifdef ECRYPT_BIG_ENDIAN
#define U16TO16_LITTLE(v) SWAP16(v)
#define U32TO32_LITTLE(v) SWAP32(v)
#define U64TO64_LITTLE(v) SWAP64(v)
#define U16TO16_BIG(v) (v)
#define U32TO32_BIG(v) (v)
#define U64TO64_BIG(v) (v)
#endif
#include "ecrypt-machine.h"
/*
* The following macros load words from an array of bytes with
* different types of endianness, and vice versa.
*/
#define ECRYPT_DEFAULT_BTOW
#if (!defined(ECRYPT_UNKNOWN) && defined(ECRYPT_I8T_IS_BYTE))
#define U8TO16_LITTLE(p) U16TO16_LITTLE(((u16*)(p))[0])
#define U8TO32_LITTLE(p) U32TO32_LITTLE(((u32*)(p))[0])
#define U8TO64_LITTLE(p) U64TO64_LITTLE(((u64*)(p))[0])
#define U8TO16_BIG(p) U16TO16_BIG(((u16*)(p))[0])
#define U8TO32_BIG(p) U32TO32_BIG(((u32*)(p))[0])
#define U8TO64_BIG(p) U64TO64_BIG(((u64*)(p))[0])
#define U16TO8_LITTLE(p, v) (((u16*)(p))[0] = U16TO16_LITTLE(v))
#define U32TO8_LITTLE(p, v) (((u32*)(p))[0] = U32TO32_LITTLE(v))
#define U64TO8_LITTLE(p, v) (((u64*)(p))[0] = U64TO64_LITTLE(v))
#define U16TO8_BIG(p, v) (((u16*)(p))[0] = U16TO16_BIG(v))
#define U32TO8_BIG(p, v) (((u32*)(p))[0] = U32TO32_BIG(v))
#define U64TO8_BIG(p, v) (((u64*)(p))[0] = U64TO64_BIG(v))
#else
#define U8TO16_LITTLE(p) \
(((u16)((p)[0]) ) | \
((u16)((p)[1]) << 8))
#define U8TO32_LITTLE(p) \
(((u32)((p)[0]) ) | \
((u32)((p)[1]) << 8) | \
((u32)((p)[2]) << 16) | \
((u32)((p)[3]) << 24))
#ifdef ECRYPT_NATIVE64
#define U8TO64_LITTLE(p) \
(((u64)((p)[0]) ) | \
((u64)((p)[1]) << 8) | \
((u64)((p)[2]) << 16) | \
((u64)((p)[3]) << 24) | \
((u64)((p)[4]) << 32) | \
((u64)((p)[5]) << 40) | \
((u64)((p)[6]) << 48) | \
((u64)((p)[7]) << 56))
#else
#define U8TO64_LITTLE(p) \
((u64)U8TO32_LITTLE(p) | ((u64)U8TO32_LITTLE((p) + 4) << 32))
#endif
#define U8TO16_BIG(p) \
(((u16)((p)[0]) << 8) | \
((u16)((p)[1]) ))
#define U8TO32_BIG(p) \
(((u32)((p)[0]) << 24) | \
((u32)((p)[1]) << 16) | \
((u32)((p)[2]) << 8) | \
((u32)((p)[3]) ))
#ifdef ECRYPT_NATIVE64
#define U8TO64_BIG(p) \
(((u64)((p)[0]) << 56) | \
((u64)((p)[1]) << 48) | \
((u64)((p)[2]) << 40) | \
((u64)((p)[3]) << 32) | \
((u64)((p)[4]) << 24) | \
((u64)((p)[5]) << 16) | \
((u64)((p)[6]) << 8) | \
((u64)((p)[7]) ))
#else
#define U8TO64_BIG(p) \
(((u64)U8TO32_BIG(p) << 32) | (u64)U8TO32_BIG((p) + 4))
#endif
#define U16TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
} while (0)
#define U32TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
(p)[2] = U8V((v) >> 16); \
(p)[3] = U8V((v) >> 24); \
} while (0)
#ifdef ECRYPT_NATIVE64
#define U64TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
(p)[2] = U8V((v) >> 16); \
(p)[3] = U8V((v) >> 24); \
(p)[4] = U8V((v) >> 32); \
(p)[5] = U8V((v) >> 40); \
(p)[6] = U8V((v) >> 48); \
(p)[7] = U8V((v) >> 56); \
} while (0)
#else
#define U64TO8_LITTLE(p, v) \
do { \
U32TO8_LITTLE((p), U32V((v) )); \
U32TO8_LITTLE((p) + 4, U32V((v) >> 32)); \
} while (0)
#endif
#define U16TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
} while (0)
#define U32TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) >> 24); \
(p)[1] = U8V((v) >> 16); \
(p)[2] = U8V((v) >> 8); \
(p)[3] = U8V((v) ); \
} while (0)
#ifdef ECRYPT_NATIVE64
#define U64TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) >> 56); \
(p)[1] = U8V((v) >> 48); \
(p)[2] = U8V((v) >> 40); \
(p)[3] = U8V((v) >> 32); \
(p)[4] = U8V((v) >> 24); \
(p)[5] = U8V((v) >> 16); \
(p)[6] = U8V((v) >> 8); \
(p)[7] = U8V((v) ); \
} while (0)
#else
#define U64TO8_BIG(p, v) \
do { \
U32TO8_BIG((p), U32V((v) >> 32)); \
U32TO8_BIG((p) + 4, U32V((v) )); \
} while (0)
#endif
#endif
#include "ecrypt-machine.h"
/* ------------------------------------------------------------------------- */
#endif


@@ -1,279 +0,0 @@
/* ecrypt-sync.h */
/*
* Header file for synchronous stream ciphers without authentication
* mechanism.
*
* *** Please only edit parts marked with "[edit]". ***
*/
#ifndef ECRYPT_SYNC
#define ECRYPT_SYNC
#include "ecrypt-portable.h"
/* ------------------------------------------------------------------------- */
/* Cipher parameters */
/*
* The name of your cipher.
*/
#define ECRYPT_NAME "Salsa20" /* [edit] */
#define ECRYPT_PROFILE "S!_H."
/*
* Specify which key and IV sizes are supported by your cipher. A user
* should be able to enumerate the supported sizes by running the
* following code:
*
* for (i = 0; ECRYPT_KEYSIZE(i) <= ECRYPT_MAXKEYSIZE; ++i)
* {
* keysize = ECRYPT_KEYSIZE(i);
*
* ...
* }
*
* All sizes are in bits.
*/
#define ECRYPT_MAXKEYSIZE 256 /* [edit] */
#define ECRYPT_KEYSIZE(i) (128 + (i)*128) /* [edit] */
#define ECRYPT_MAXIVSIZE 64 /* [edit] */
#define ECRYPT_IVSIZE(i) (64 + (i)*64) /* [edit] */
/* ------------------------------------------------------------------------- */
/* Data structures */
/*
* ECRYPT_ctx is the structure containing the representation of the
* internal state of your cipher.
*/
typedef struct
{
u32 input[16]; /* could be compressed */
/*
* [edit]
*
* Put here all state variables needed during the encryption process.
*/
} ECRYPT_ctx;
/* ------------------------------------------------------------------------- */
/* Mandatory functions */
/*
* Key and message independent initialization. This function will be
* called once when the program starts (e.g., to build expanded S-box
* tables).
*/
void ECRYPT_init();
/*
* Key setup. It is the user's responsibility to select the values of
* keysize and ivsize from the set of supported values specified
* above.
*/
void ECRYPT_keysetup(
ECRYPT_ctx* ctx,
const u8* key,
u32 keysize, /* Key size in bits. */
u32 ivsize); /* IV size in bits. */
/*
* IV setup. After having called ECRYPT_keysetup(), the user is
* allowed to call ECRYPT_ivsetup() multiple times in order to
* encrypt/decrypt different messages with the same key but different
* IV's.
*/
void ECRYPT_ivsetup(
ECRYPT_ctx* ctx,
const u8* iv);
/*
* Encryption/decryption of arbitrary length messages.
*
* For efficiency reasons, the API provides two types of
* encrypt/decrypt functions. The ECRYPT_encrypt_bytes() function
* (declared here) encrypts byte strings of arbitrary length, while
* the ECRYPT_encrypt_blocks() function (defined later) only accepts
* lengths which are multiples of ECRYPT_BLOCKLENGTH.
*
* The user is allowed to make multiple calls to
* ECRYPT_encrypt_blocks() to incrementally encrypt a long message,
* but NOT to make additional encryption calls once
* ECRYPT_encrypt_bytes() has been called (unless a new message is
* started, of course). For example, this sequence of calls is acceptable:
*
* ECRYPT_keysetup();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_bytes();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_blocks();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_bytes();
*
* The following sequence is not:
*
* ECRYPT_keysetup();
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_bytes();
* ECRYPT_encrypt_blocks();
*/
void ECRYPT_encrypt_bytes(
ECRYPT_ctx* ctx,
const u8* plaintext,
u8* ciphertext,
u32 msglen); /* Message length in bytes. */
void ECRYPT_decrypt_bytes(
ECRYPT_ctx* ctx,
const u8* ciphertext,
u8* plaintext,
u32 msglen); /* Message length in bytes. */
/* ------------------------------------------------------------------------- */
/* Optional features */
/*
* For testing purposes it can sometimes be useful to have a function
* which immediately generates keystream without having to provide it
* with a zero plaintext. If your cipher cannot provide this function
* (e.g., because it is not strictly a synchronous cipher), please
* reset the ECRYPT_GENERATES_KEYSTREAM flag.
*/
#define ECRYPT_GENERATES_KEYSTREAM
#ifdef ECRYPT_GENERATES_KEYSTREAM
void ECRYPT_keystream_bytes(
ECRYPT_ctx* ctx,
u8* keystream,
u32 length); /* Length of keystream in bytes. */
#endif
/* ------------------------------------------------------------------------- */
/* Optional optimizations */
/*
* By default, the functions in this section are implemented using
* calls to functions declared above. However, you might want to
* implement them differently for performance reasons.
*/
/*
* All-in-one encryption/decryption of (short) packets.
*
* The default definitions of these functions can be found in
* "ecrypt-sync.c". If you want to implement them differently, please
* undef the ECRYPT_USES_DEFAULT_ALL_IN_ONE flag.
*/
#define ECRYPT_USES_DEFAULT_ALL_IN_ONE /* [edit] */
void ECRYPT_encrypt_packet(
ECRYPT_ctx* ctx,
const u8* iv,
const u8* plaintext,
u8* ciphertext,
u32 msglen);
void ECRYPT_decrypt_packet(
ECRYPT_ctx* ctx,
const u8* iv,
const u8* ciphertext,
u8* plaintext,
u32 msglen);
/*
* Encryption/decryption of blocks.
*
* By default, these functions are defined as macros. If you want to
* provide a different implementation, please undef the
* ECRYPT_USES_DEFAULT_BLOCK_MACROS flag and implement the functions
* declared below.
*/
#define ECRYPT_BLOCKLENGTH 64 /* [edit] */
#define ECRYPT_USES_DEFAULT_BLOCK_MACROS /* [edit] */
#ifdef ECRYPT_USES_DEFAULT_BLOCK_MACROS
#define ECRYPT_encrypt_blocks(ctx, plaintext, ciphertext, blocks) \
ECRYPT_encrypt_bytes(ctx, plaintext, ciphertext, \
(blocks) * ECRYPT_BLOCKLENGTH)
#define ECRYPT_decrypt_blocks(ctx, ciphertext, plaintext, blocks) \
ECRYPT_decrypt_bytes(ctx, ciphertext, plaintext, \
(blocks) * ECRYPT_BLOCKLENGTH)
#ifdef ECRYPT_GENERATES_KEYSTREAM
#define ECRYPT_keystream_blocks(ctx, keystream, blocks) \
ECRYPT_keystream_bytes(ctx, keystream, \
(blocks) * ECRYPT_BLOCKLENGTH)
#endif
#else
void ECRYPT_encrypt_blocks(
ECRYPT_ctx* ctx,
const u8* plaintext,
u8* ciphertext,
u32 blocks); /* Message length in blocks. */
void ECRYPT_decrypt_blocks(
ECRYPT_ctx* ctx,
const u8* ciphertext,
u8* plaintext,
u32 blocks); /* Message length in blocks. */
#ifdef ECRYPT_GENERATES_KEYSTREAM
void ECRYPT_keystream_blocks(
ECRYPT_ctx* ctx,
const u8* keystream,
u32 blocks); /* Keystream length in blocks. */
#endif
#endif
/*
* If your cipher can be implemented in different ways, you can use
* the ECRYPT_VARIANT parameter to allow the user to choose between
* them at compile time (e.g., gcc -DECRYPT_VARIANT=3 ...). Please
* only use this possibility if you really think it could make a
* significant difference and keep the number of variants
* (ECRYPT_MAXVARIANT) as small as possible (definitely not more than
* 10). Note also that all variants should have exactly the same
* external interface (i.e., the same ECRYPT_BLOCKLENGTH, etc.).
*/
#define ECRYPT_MAXVARIANT 1 /* [edit] */
#ifndef ECRYPT_VARIANT
#define ECRYPT_VARIANT 1
#endif
#if (ECRYPT_VARIANT > ECRYPT_MAXVARIANT)
#error this variant does not exist
#endif
/* ------------------------------------------------------------------------- */
#endif
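
Putting the mandatory functions together, a keystream sketch (illustrative; it mirrors the Salsa20_XORKeyStream wrapper in AstroBWT.cpp, minus that wrapper's guard-byte memsets):

#include "ecrypt-sync.h"

/* Illustrative only: 256-bit key, zero 64-bit IV, 64 bytes of keystream.
 * ECRYPT_keysetup() takes the key and IV sizes in bits, per the comments above. */
static void example_keystream(const u8 key[32], u8 stream[64])
{
    ECRYPT_ctx ctx;
    const u8 iv[8] = {0};
    ECRYPT_keysetup(&ctx, key, 256, 64);
    ECRYPT_ivsetup(&ctx, iv);
    ECRYPT_keystream_bytes(&ctx, stream, 64);
}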


@@ -1,219 +0,0 @@
/*
salsa20-merged.c version 20051118
D. J. Bernstein
Public domain.
*/
#include "ecrypt-sync.h"
#define ROTATE(v,c) (ROTL32(v,c))
#define XOR(v,w) ((v) ^ (w))
#define PLUS(v,w) (U32V((v) + (w)))
#define PLUSONE(v) (PLUS((v),1))
void ECRYPT_init(void)
{
return;
}
static const char sigma[16] = "expand 32-byte k";
static const char tau[16] = "expand 16-byte k";
void ECRYPT_keysetup(ECRYPT_ctx *x,const u8 *k,u32 kbits,u32 ivbits)
{
const char *constants;
x->input[1] = U8TO32_LITTLE(k + 0);
x->input[2] = U8TO32_LITTLE(k + 4);
x->input[3] = U8TO32_LITTLE(k + 8);
x->input[4] = U8TO32_LITTLE(k + 12);
if (kbits == 256) { /* recommended */
k += 16;
constants = sigma;
} else { /* kbits == 128 */
constants = tau;
}
x->input[11] = U8TO32_LITTLE(k + 0);
x->input[12] = U8TO32_LITTLE(k + 4);
x->input[13] = U8TO32_LITTLE(k + 8);
x->input[14] = U8TO32_LITTLE(k + 12);
x->input[0] = U8TO32_LITTLE(constants + 0);
x->input[5] = U8TO32_LITTLE(constants + 4);
x->input[10] = U8TO32_LITTLE(constants + 8);
x->input[15] = U8TO32_LITTLE(constants + 12);
}
void ECRYPT_ivsetup(ECRYPT_ctx *x,const u8 *iv)
{
x->input[6] = U8TO32_LITTLE(iv + 0);
x->input[7] = U8TO32_LITTLE(iv + 4);
x->input[8] = 0;
x->input[9] = 0;
}
void ECRYPT_encrypt_bytes(ECRYPT_ctx *x,const u8 *m,u8 *c,u32 bytes)
{
u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
u8 *ctarget = 0;
u8 tmp[64];
int i;
if (!bytes) return;
j0 = x->input[0];
j1 = x->input[1];
j2 = x->input[2];
j3 = x->input[3];
j4 = x->input[4];
j5 = x->input[5];
j6 = x->input[6];
j7 = x->input[7];
j8 = x->input[8];
j9 = x->input[9];
j10 = x->input[10];
j11 = x->input[11];
j12 = x->input[12];
j13 = x->input[13];
j14 = x->input[14];
j15 = x->input[15];
for (;;) {
if (bytes < 64) {
for (i = 0;i < bytes;++i) tmp[i] = m[i];
m = tmp;
ctarget = c;
c = tmp;
}
x0 = j0;
x1 = j1;
x2 = j2;
x3 = j3;
x4 = j4;
x5 = j5;
x6 = j6;
x7 = j7;
x8 = j8;
x9 = j9;
x10 = j10;
x11 = j11;
x12 = j12;
x13 = j13;
x14 = j14;
x15 = j15;
for (i = 20;i > 0;i -= 2) {
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
}
x0 = PLUS(x0,j0);
x1 = PLUS(x1,j1);
x2 = PLUS(x2,j2);
x3 = PLUS(x3,j3);
x4 = PLUS(x4,j4);
x5 = PLUS(x5,j5);
x6 = PLUS(x6,j6);
x7 = PLUS(x7,j7);
x8 = PLUS(x8,j8);
x9 = PLUS(x9,j9);
x10 = PLUS(x10,j10);
x11 = PLUS(x11,j11);
x12 = PLUS(x12,j12);
x13 = PLUS(x13,j13);
x14 = PLUS(x14,j14);
x15 = PLUS(x15,j15);
x0 = XOR(x0,U8TO32_LITTLE(m + 0));
x1 = XOR(x1,U8TO32_LITTLE(m + 4));
x2 = XOR(x2,U8TO32_LITTLE(m + 8));
x3 = XOR(x3,U8TO32_LITTLE(m + 12));
x4 = XOR(x4,U8TO32_LITTLE(m + 16));
x5 = XOR(x5,U8TO32_LITTLE(m + 20));
x6 = XOR(x6,U8TO32_LITTLE(m + 24));
x7 = XOR(x7,U8TO32_LITTLE(m + 28));
x8 = XOR(x8,U8TO32_LITTLE(m + 32));
x9 = XOR(x9,U8TO32_LITTLE(m + 36));
x10 = XOR(x10,U8TO32_LITTLE(m + 40));
x11 = XOR(x11,U8TO32_LITTLE(m + 44));
x12 = XOR(x12,U8TO32_LITTLE(m + 48));
x13 = XOR(x13,U8TO32_LITTLE(m + 52));
x14 = XOR(x14,U8TO32_LITTLE(m + 56));
x15 = XOR(x15,U8TO32_LITTLE(m + 60));
j8 = PLUSONE(j8);
if (!j8) {
j9 = PLUSONE(j9);
/* stopping at 2^70 bytes per nonce is user's responsibility */
}
U32TO8_LITTLE(c + 0,x0);
U32TO8_LITTLE(c + 4,x1);
U32TO8_LITTLE(c + 8,x2);
U32TO8_LITTLE(c + 12,x3);
U32TO8_LITTLE(c + 16,x4);
U32TO8_LITTLE(c + 20,x5);
U32TO8_LITTLE(c + 24,x6);
U32TO8_LITTLE(c + 28,x7);
U32TO8_LITTLE(c + 32,x8);
U32TO8_LITTLE(c + 36,x9);
U32TO8_LITTLE(c + 40,x10);
U32TO8_LITTLE(c + 44,x11);
U32TO8_LITTLE(c + 48,x12);
U32TO8_LITTLE(c + 52,x13);
U32TO8_LITTLE(c + 56,x14);
U32TO8_LITTLE(c + 60,x15);
if (bytes <= 64) {
if (bytes < 64) {
for (i = 0;i < bytes;++i) ctarget[i] = c[i];
}
x->input[8] = j8;
x->input[9] = j9;
return;
}
bytes -= 64;
c += 64;
m += 64;
}
}
void ECRYPT_decrypt_bytes(ECRYPT_ctx *x,const u8 *c,u8 *m,u32 bytes)
{
ECRYPT_encrypt_bytes(x,c,m,bytes);
}
void ECRYPT_keystream_bytes(ECRYPT_ctx *x,u8 *stream,u32 bytes)
{
u32 i;
for (i = 0; i < bytes; ++i) stream[i] = 0;
ECRYPT_encrypt_bytes(x,stream,stream,bytes);
}
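
Each group of four XOR/ROTATE lines in the unrolled loop above is one Salsa20 quarter-round; written as a macro the pattern is (a sketch, not part of the original file):

/* Illustrative only: the quarter-round applied to state words (a,b,c,d);
 * e.g. the first group in the loop above is QUARTERROUND(x0, x4, x8, x12). */
#define QUARTERROUND(a, b, c, d)         \
    b = XOR(b, ROTATE(PLUS(a, d),  7));  \
    c = XOR(c, ROTATE(PLUS(b, a),  9));  \
    d = XOR(d, ROTATE(PLUS(c, b), 13));  \
    a = XOR(a, ROTATE(PLUS(d, c), 18));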


@@ -1,57 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif
#define ALIGN .balign
#define dq .quad
.global DECL(SHA3_256_AVX2_ASM)
ALIGN 64
DECL(SHA3_256_AVX2_ASM):
#include "sha3_256_avx2.inc"
KeccakF1600_AVX2_ASM:
lea r8,[rip+rot_left+96]
lea r9,[rip+rot_right+96]
lea r10,[rip+rndc]
#include "sha3_256_keccakf1600_avx2.inc"
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif


@@ -1,45 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE
PUBLIC SHA3_256_AVX2_ASM
ALIGN 64
SHA3_256_AVX2_ASM:
include sha3_256_avx2.inc
KeccakF1600_AVX2_ASM:
lea r8,[rot_left+96]
lea r9,[rot_right+96]
lea r10,[rndc]
include sha3_256_keccakf1600_avx2.inc
_SHA3_256_AVX2_ASM ENDS
END


@@ -1,162 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
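;#
;# Register contract, as read from the code below (it matches the ms_abi
;# declaration in AstroBWT.cpp): rcx = input, rdx = input size in bytes,
;# r8 = 32-byte output. The size is split into whole 8-byte lanes (r14)
;# and a 0-7 byte tail (r12).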
vzeroupper
mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+16],rsi
mov qword ptr [rsp+24],rdi
push rbp
push r12
push r13
push r14
push r15
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
sub rsp,320
lea rbp,[rsp+64]
and rbp,-32
vpxor xmm0,xmm0,xmm0
xor edi,edi
mov dword ptr [rbp],50462976
mov r12,rdx
mov dword ptr [rbp+4],169150212
mov r14,rdx
mov dword ptr [rbp+8],218436623
shr r14,3
and r12d,7
mov dword ptr [rbp+12],135009046
mov r13,r8
mov byte ptr [rbp+16],9
mov rsi,rcx
mov ebx,edi
vmovdqa ymmword ptr [rbp+32],ymm0
vmovdqa ymmword ptr [rbp+64],ymm0
vmovdqa ymmword ptr [rbp+96],ymm0
vmovdqa ymmword ptr [rbp+128],ymm0
vmovdqa ymmword ptr [rbp+160],ymm0
vmovdqa ymmword ptr [rbp+192],ymm0
vmovdqa ymmword ptr [rbp+224],ymm0
test r14,r14
je sha3_main_loop_end
sha3_main_loop:
movzx eax,byte ptr [rbp+rbx]
lea rcx,[rbp+32]
lea rcx,[rcx+rax*8]
mov rax,qword ptr [rsi]
xor qword ptr [rcx],rax
lea r15,[rbx+1]
cmp rbx,16
jne skip_keccak
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
skip_keccak:
cmp rbx,16
mov rax,rdi
cmovne rax,r15
add rsi,8
mov rbx,rax
sub r14,1
jne sha3_main_loop
sha3_main_loop_end:
mov rdx,rdi
test r12,r12
je sha3_tail_loop_end
mov r8,rdi
sha3_tail_loop:
movzx eax,byte ptr [rdx+rsi]
inc rdx
shlx rcx,rax,r8
or rdi,rcx
add r8,8
cmp rdx,r12
jb sha3_tail_loop
sha3_tail_loop_end:
movzx eax,byte ptr [rbp+rbx]
lea rdx,[rbp+32]
lea rdx,[rdx+rax*8]
mov ecx,6
lea rax,[r12*8]
shlx rcx,rcx,rax
xor rcx,qword ptr [rdx]
mov eax,1
shl rax,63
xor rcx,rdi
mov qword ptr [rdx],rcx
xor qword ptr [rbp+104],rax
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
vmovups ymm0,ymmword ptr [rbp+32]
vmovups ymmword ptr [r13],ymm0
vzeroupper
add rsp,320
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rbp
mov rbx,qword ptr [rsp+8]
mov rsi,qword ptr [rsp+16]
mov rdi,qword ptr [rsp+24]
ret


@@ -1,203 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
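;#
;# Inputs, as set up by the SHA3_256_AVX2_ASM callers above: rcx = the
;# Keccak-f[1600] state, r8/r9 = left/right rotation-count tables (pre-biased
;# by +96), r10 = round-constant table. The loop below runs all 24 rounds.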
mov eax,24
lea rcx,[rcx+96]
vpbroadcastq ymm0,QWORD PTR [rcx-96]
vmovdqu ymm1,YMMWORD PTR [rcx-88]
vmovdqu ymm2,YMMWORD PTR [rcx-56]
vmovdqu ymm3,YMMWORD PTR [rcx-24]
vmovdqu ymm4,YMMWORD PTR [rcx+8]
vmovdqu ymm5,YMMWORD PTR [rcx+40]
vmovdqu ymm6,YMMWORD PTR [rcx+72]
ALIGN 64
Loop_avx2:
vpshufd ymm13,ymm2,78
vpxor ymm12,ymm5,ymm3
vpxor ymm9,ymm4,ymm6
vpxor ymm12,ymm12,ymm1
vpxor ymm12,ymm12,ymm9
vpermq ymm11,ymm12,147
vpxor ymm13,ymm13,ymm2
vpermq ymm7,ymm13,78
vpsrlq ymm8,ymm12,63
vpaddq ymm9,ymm12,ymm12
vpor ymm8,ymm8,ymm9
vpermq ymm15,ymm8,57
vpxor ymm14,ymm8,ymm11
vpermq ymm14,ymm14,0
vpxor ymm13,ymm13,ymm0
vpxor ymm13,ymm13,ymm7
vpsrlq ymm7,ymm13,63
vpaddq ymm8,ymm13,ymm13
vpor ymm8,ymm8,ymm7
vpxor ymm2,ymm2,ymm14
vpxor ymm0,ymm0,ymm14
vpblendd ymm15,ymm15,ymm8,192
vpblendd ymm11,ymm11,ymm13,3
vpxor ymm15,ymm15,ymm11
vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96]
vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96]
vpor ymm2,ymm2,ymm10
vpxor ymm3,ymm3,ymm15
vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32]
vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32]
vpor ymm3,ymm3,ymm11
vpxor ymm4,ymm4,ymm15
vpsllvq ymm12,ymm4,YMMWORD PTR [r8]
vpsrlvq ymm4,ymm4,YMMWORD PTR [r9]
vpor ymm4,ymm4,ymm12
vpxor ymm5,ymm5,ymm15
vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32]
vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32]
vpor ymm5,ymm5,ymm13
vpxor ymm6,ymm6,ymm15
vpermq ymm10,ymm2,141
vpermq ymm11,ymm3,141
vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64]
vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64]
vpor ymm8,ymm8,ymm14
vpxor ymm1,ymm1,ymm15
vpermq ymm12,ymm4,27
vpermq ymm13,ymm5,114
vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64]
vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64]
vpor ymm9,ymm9,ymm15
vpsrldq ymm14,ymm8,8
vpandn ymm7,ymm8,ymm14
vpblendd ymm3,ymm9,ymm13,12
vpblendd ymm15,ymm11,ymm9,12
vpblendd ymm5,ymm10,ymm11,12
vpblendd ymm14,ymm9,ymm10,12
vpblendd ymm3,ymm3,ymm11,48
vpblendd ymm15,ymm15,ymm12,48
vpblendd ymm5,ymm5,ymm9,48
vpblendd ymm14,ymm14,ymm13,48
vpblendd ymm3,ymm3,ymm12,192
vpblendd ymm15,ymm15,ymm13,192
vpblendd ymm5,ymm5,ymm13,192
vpblendd ymm14,ymm14,ymm11,192
vpandn ymm3,ymm3,ymm15
vpandn ymm5,ymm5,ymm14
vpblendd ymm6,ymm12,ymm9,12
vpblendd ymm15,ymm10,ymm12,12
vpxor ymm3,ymm3,ymm10
vpblendd ymm6,ymm6,ymm10,48
vpblendd ymm15,ymm15,ymm11,48
vpxor ymm5,ymm5,ymm12
vpblendd ymm6,ymm6,ymm11,192
vpblendd ymm15,ymm15,ymm9,192
vpandn ymm6,ymm6,ymm15
vpxor ymm6,ymm6,ymm13
vpermq ymm4,ymm8,30
vpblendd ymm15,ymm4,ymm0,48
vpermq ymm1,ymm8,57
vpblendd ymm1,ymm1,ymm0,192
vpandn ymm1,ymm1,ymm15
vpblendd ymm2,ymm11,ymm12,12
vpblendd ymm14,ymm13,ymm11,12
vpblendd ymm2,ymm2,ymm13,48
vpblendd ymm14,ymm14,ymm10,48
vpblendd ymm2,ymm2,ymm10,192
vpblendd ymm14,ymm14,ymm12,192
vpandn ymm2,ymm2,ymm14
vpxor ymm2,ymm2,ymm9
vpermq ymm7,ymm7,0
vpermq ymm3,ymm3,27
vpermq ymm5,ymm5,141
vpermq ymm6,ymm6,114
vpblendd ymm4,ymm13,ymm10,12
vpblendd ymm14,ymm12,ymm13,12
vpblendd ymm4,ymm4,ymm12,48
vpblendd ymm14,ymm14,ymm9,48
vpblendd ymm4,ymm4,ymm9,192
vpblendd ymm14,ymm14,ymm10,192
vpandn ymm4,ymm4,ymm14
vpxor ymm0,ymm0,ymm7
vpxor ymm1,ymm1,ymm8
vpxor ymm4,ymm4,ymm11
vpxor ymm0,ymm0,YMMWORD PTR [r10]
lea r10,[r10+32]
dec eax
jnz Loop_avx2
vmovq QWORD PTR [rcx-96],xmm0
vmovdqu YMMWORD PTR [rcx-88],ymm1
vmovdqu YMMWORD PTR [rcx-56],ymm2
vmovdqu YMMWORD PTR [rcx-24],ymm3
vmovdqu YMMWORD PTR [rcx+8],ymm4
vmovdqu YMMWORD PTR [rcx+40],ymm5
vmovdqu YMMWORD PTR [rcx+72],ymm6
ret
ALIGN 32
rot_left:
dq 3, 18, 36, 41
dq 1, 62, 28, 27
dq 45, 6, 56, 39
dq 10, 61, 55, 8
dq 2, 15, 25, 20
dq 44, 43, 21, 14
ALIGN 32
rot_right:
dq 64-3, 64-18, 64-36, 64-41
dq 64-1, 64-62, 64-28, 64-27
dq 64-45, 64-6, 64-56, 64-39
dq 64-10, 64-61, 64-55, 64-8
dq 64-2, 64-15, 64-25, 64-20
dq 64-44, 64-43, 64-21, 64-14
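; rot_left holds the per-group rho rotation counts r and rot_right the
; complements 64-r: each vpsllvq/vpsrlvq/vpor triple in Loop_avx2 builds the
; 64-bit rotate (x << r) | (x >> (64 - r)) that AVX2 lacks as a single
; instruction.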
ALIGN 32
rndc:
dq 1, 1, 1, 1
dq 32898, 32898, 32898, 32898
dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714
dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224
dq 32907, 32907, 32907, 32907
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585
dq 138, 138, 138, 138
dq 136, 136, 136, 136
dq 2147516425, 2147516425, 2147516425, 2147516425
dq 2147483658, 2147483658, 2147483658, 2147483658
dq 2147516555, 2147516555, 2147516555, 2147516555
dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947
dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713
dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579
dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578
dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936
dq 32778, 32778, 32778, 32778
dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232
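rndc holds the 24 Keccak-f[1600] round constants, one row per round, each value repeated four times so the per-round vpxor into ymm0 (r10 appears to walk this table) can load a whole 32-byte row. The values match the FIPS 202 LFSR derivation; the following standalone C sketch (illustrative only, not part of the source tree; lfsr86540 follows the compact Keccak reference code) prints the same 24 values in decimal:

#include <stdint.h>
#include <stdio.h>

/* rc(t) LFSR from FIPS 202 (polynomial x^8 + x^6 + x^5 + x^4 + 1):
   returns the low bit and steps the state. */
static int lfsr86540(uint8_t* r)
{
    const int bit = *r & 1;
    *r = (*r & 0x80) ? (uint8_t)((*r << 1) ^ 0x71) : (uint8_t)(*r << 1);
    return bit;
}

int main(void)
{
    uint8_t state = 0x01;
    int round, j;
    for (round = 0; round < 24; ++round) {
        uint64_t rc = 0;
        for (j = 0; j < 7; ++j) {                 /* constant has bits at positions 2^j - 1 */
            if (lfsr86540(&state)) {
                rc |= 1ULL << ((1u << j) - 1);
            }
        }
        printf("%llu\n", (unsigned long long)rc); /* decimal, like the dq rows above */
    }
    return 0;
}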


@@ -1,208 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "crypto/astrobwt/sort_indices2.h"
#include "base/tools/bswap_64.h"
#include <cstring>
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#define RESTRICT __restrict__
#elif _MSC_VER
#define NOINLINE __declspec(noinline)
#define RESTRICT __restrict
#else
#define NOINLINE
#define RESTRICT
#endif
#if __has_cpp_attribute(unlikely)
#define UNLIKELY(X) (X) [[unlikely]]
#elif defined __GNUC__
#define UNLIKELY(X) (__builtin_expect((X), 0))
#else
#define UNLIKELY(X) (X)
#endif
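/* fix(): bounded insertion step used after the radix passes below. When
   entry i ties with its neighbors on the first two bytes (same high 16 bits
   of the packed value), it is shifted left until the next four suffix bytes
   (read at position + 2 and compared big-endian via bswap_32) are in order. */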
static NOINLINE void fix(const uint8_t* RESTRICT v, uint32_t* RESTRICT indices, int32_t i)
{
uint32_t prev_t = indices[i - 1];
uint32_t t = indices[i];
const uint32_t data_a = bswap_32(*(const uint32_t*)(v + (t & 0xFFFF) + 2));
if (data_a < bswap_32(*(const uint32_t*)(v + (prev_t & 0xFFFF) + 2)))
{
const uint32_t t2 = prev_t;
int32_t j = i - 1;
do
{
indices[j + 1] = prev_t;
--j;
if (j < 0) {
break;
}
prev_t = indices[j];
} while (((t ^ prev_t) <= 0xFFFF) && (data_a < bswap_32(*(const uint32_t*)(v + (prev_t & 0xFFFF) + 2))));
indices[j + 1] = t;
t = t2;
}
}
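/* sort_indices(): two counting-sort passes (low key byte, then high key
   byte) order all N suffix positions by their first two bytes, packed as
   (byte0 << 24) | (byte1 << 16) | pos; fix() then insertion-sorts runs that
   share those two bytes using the next four bytes; finally the 32-bit
   entries are compacted down to 16-bit positions. */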
static NOINLINE void sort_indices(uint32_t N, const uint8_t* RESTRICT v, uint32_t* RESTRICT indices, uint32_t* RESTRICT tmp_indices)
{
uint8_t byte_counters[2][256] = {};
uint32_t counters[2][256];
{
#define ITER(X) ++byte_counters[1][v[i + X]];
enum { unroll = 12 };
uint32_t i = 0;
const uint32_t n = N - (unroll - 1);
for (; i < n; i += unroll) {
ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8); ITER(9); ITER(10); ITER(11);
}
for (; i < N; ++i) {
ITER(0);
}
memcpy(&byte_counters[0], &byte_counters[1], 256);
--byte_counters[0][v[0]];
#undef ITER
}
{
uint32_t c0 = byte_counters[0][0];
uint32_t c1 = byte_counters[1][0] - 1;
counters[0][0] = c0;
counters[1][0] = c1;
uint8_t* src = &byte_counters[0][0] + 1;
uint32_t* dst = &counters[0][0] + 1;
const uint8_t* const e = &byte_counters[0][0] + 256;
do {
c0 += src[0];
c1 += src[256];
dst[0] = c0;
dst[256] = c1;
++src;
++dst;
} while (src < e);
}
{
#define ITER(X) \
do { \
const uint32_t byte0 = v[i - X + 0]; \
const uint32_t byte1 = v[i - X + 1]; \
tmp_indices[counters[0][byte1]--] = (byte0 << 24) | (byte1 << 16) | (i - X); \
} while (0)
enum { unroll = 8 };
uint32_t i = N;
for (; i >= unroll; i -= unroll) {
ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
}
for (; i > 0; --i) {
ITER(1);
}
#undef ITER
}
{
#define ITER(X) \
do { \
const uint32_t data = tmp_indices[i - X]; \
indices[counters[1][data >> 24]--] = data; \
} while (0)
enum { unroll = 8 };
uint32_t i = N;
for (; i >= unroll; i -= unroll) {
ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
}
for (; i > 0; --i) {
ITER(1);
}
#undef ITER
}
{
#define ITER(X) do { if UNLIKELY(a[X * 2] == a[(X + 1) * 2]) fix(v, indices, i + X); } while (0)
enum { unroll = 16 };
uint32_t i = 1;
const uint32_t n = N - (unroll - 1);
const uint16_t* a = ((const uint16_t*)indices) + 1;
for (; i < n; i += unroll, a += unroll * 2) {
ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
}
for (; i < N; ++i, a += 2) {
ITER(0);
}
#undef ITER
}
{
#define ITER(X) a[X] = b[X * 2];
enum { unroll = 32 };
uint16_t* a = (uint16_t*)indices;
uint16_t* b = (uint16_t*)indices;
uint16_t* e = ((uint16_t*)indices) + (N - (unroll - 1));
for (; a < e; a += unroll, b += unroll * 2) {
ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
ITER(16); ITER(17); ITER(18); ITER(19); ITER(20); ITER(21); ITER(22); ITER(23);
ITER(24); ITER(25); ITER(26); ITER(27); ITER(28); ITER(29); ITER(30); ITER(31);
}
e = ((uint16_t*)indices) + N;
for (; a < e; ++a, b += 2) {
ITER(0);
}
#undef ITER
}
}
void sort_indices_astrobwt_v2(uint32_t N, const uint8_t* v, uint32_t* indices, uint32_t* tmp_indices)
{
sort_indices(N, v, indices, tmp_indices);
}
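For reference, the two scatter passes in sort_indices() are a textbook stable counting sort: build a histogram, turn it into inclusive prefix sums that point at each bucket's last slot, then scatter in reverse so equal keys keep their relative order. A standalone sketch of that pattern (counting_sort_bytes is a hypothetical helper, not part of XMRig):

#include <stdint.h>

/* Stable counting sort of n bytes: histogram, inclusive prefix sums to each
   bucket's last slot, reverse scatter (mirrors the counters[] /
   tmp_indices[] passes above). */
static void counting_sort_bytes(const uint8_t* in, uint8_t* out, uint32_t n)
{
    uint32_t count[256] = {0};
    uint32_t pos[256];
    uint32_t sum = 0;
    uint32_t i;
    for (i = 0; i < n; ++i) ++count[in[i]];
    for (i = 0; i < 256; ++i) { sum += count[i]; pos[i] = sum - 1; } /* wraps only for empty buckets, never read */
    for (i = n; i > 0; --i) out[pos[in[i - 1]]--] = in[i - 1];       /* reverse scatter keeps the order stable */
}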


@@ -1,26 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdint.h>
void sort_indices_astrobwt_v2(uint32_t N, const uint8_t* v, uint32_t* indices, uint32_t* tmp_indices);


@@ -1,98 +0,0 @@
/*
* ISC License
*
* Copyright (c) 2013-2021
* Frank Denis <j at pureftpd dot org>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#define ROUNDS 20
typedef struct salsa_ctx {
uint32_t input[16];
} salsa_ctx;
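/* TR[] stores the 4x4 Salsa20 state diagonal by diagonal ({0,5,10,15},
   {12,1,6,11}, {8,13,2,7}, {4,9,14,3}) so that each 128-bit load in the
   u*.h round code picks up a whole diagonal. */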
static const int TR[16] = {
0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
};
#define LOAD32_LE(p) *((uint32_t*)(p))
#define STORE32_LE(dst, src) memcpy((dst), &(src), sizeof(uint32_t))
static void
salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
{
ctx->input[TR[1]] = LOAD32_LE(k + 0);
ctx->input[TR[2]] = LOAD32_LE(k + 4);
ctx->input[TR[3]] = LOAD32_LE(k + 8);
ctx->input[TR[4]] = LOAD32_LE(k + 12);
ctx->input[TR[11]] = LOAD32_LE(k + 16);
ctx->input[TR[12]] = LOAD32_LE(k + 20);
ctx->input[TR[13]] = LOAD32_LE(k + 24);
ctx->input[TR[14]] = LOAD32_LE(k + 28);
ctx->input[TR[0]] = 0x61707865;
ctx->input[TR[5]] = 0x3320646e;
ctx->input[TR[10]] = 0x79622d32;
ctx->input[TR[15]] = 0x6b206574;
}
static void
salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
{
ctx->input[TR[6]] = LOAD32_LE(iv + 0);
ctx->input[TR[7]] = LOAD32_LE(iv + 4);
ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0);
ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4);
}
static void
salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c,
unsigned long long bytes)
{
uint32_t * const x = &ctx->input[0];
if (!bytes) {
return; /* LCOV_EXCL_LINE */
}
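    /* each include consumes what it can and falls through: u8.h handles
       512-byte chunks (8 blocks, AVX2), u4.h 256-byte chunks (4 blocks,
       SSE), u1.h whole 64-byte blocks, u0.h the final partial block */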
#include "u8.h"
#include "u4.h"
#include "u1.h"
#include "u0.h"
}
int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key)
{
struct salsa_ctx ctx;
if (!clen) {
return 0;
}
salsa_keysetup(&ctx, (const uint8_t*)key);
salsa_ivsetup(&ctx, (const uint8_t*)iv, NULL);
memset(c, 0, clen);
salsa20_encrypt_bytes(&ctx, (const uint8_t*)c, (uint8_t*)c, clen);
return 0;
}
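salsa20_stream_avx2() acts as a keystream generator: it zeroes the caller's buffer and then encrypts it in place, and a zero buffer XORed with the cipher stream is the raw stream. A minimal usage sketch (xor_with_keystream is a hypothetical wrapper, not from the source tree; assumes a 32-byte key, an 8-byte IV and an AVX2-capable CPU):

#include <stdint.h>

int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key);

/* XOR buf with Salsa20/20 keystream; the same call encrypts and decrypts. */
static void xor_with_keystream(uint8_t* buf, uint32_t len,
                               const uint8_t key[32], const uint8_t iv[8])
{
    uint8_t stream[256];
    uint32_t i;
    if (len > sizeof(stream)) {
        return;                        /* demo-sized scratch buffer */
    }
    salsa20_stream_avx2(stream, len, iv, key);
    for (i = 0; i < len; ++i) {
        buf[i] ^= stream[i];
    }
}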


@@ -1,193 +0,0 @@
if (bytes > 0) {
__m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *) (partialblock + (A * 4)) = in##A; \
*(uint32_t *) (partialblock + (B * 4)) = in##B; \
*(uint32_t *) (partialblock + (C * 4)) = in##C; \
*(uint32_t *) (partialblock + (D * 4)) = in##D; \
} while (0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
}


@@ -1,207 +0,0 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(const uint32_t *) (m + (A * 4)); \
in##B ^= *(const uint32_t *) (m + (B * 4)); \
in##C ^= *(const uint32_t *) (m + (C * 4)); \
in##D ^= *(const uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
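/* advance the 64-bit block counter, kept as two 32-bit words (x[8] = low,
   x[13] = high in this diagonal layout), with an explicit carry */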
in8 = x[8];
in9 = x[13];
in8++;
if (in8 == 0) {
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
}

View file

@@ -1,547 +0,0 @@
if (bytes >= 256) {
__m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
__m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
z15;
__m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
orig9, orig10, orig11, orig12, orig13, orig14, orig15;
uint32_t in8;
uint32_t in9;
int i;
/* the element-broadcast immediates for _mm_shuffle_epi32 are, in order:
0x00, 0x55, 0xaa, 0xff */
z0 = _mm_loadu_si128((const __m128i *) (x + 0));
z5 = _mm_shuffle_epi32(z0, 0x55);
z10 = _mm_shuffle_epi32(z0, 0xaa);
z15 = _mm_shuffle_epi32(z0, 0xff);
z0 = _mm_shuffle_epi32(z0, 0x00);
z1 = _mm_loadu_si128((const __m128i *) (x + 4));
z6 = _mm_shuffle_epi32(z1, 0xaa);
z11 = _mm_shuffle_epi32(z1, 0xff);
z12 = _mm_shuffle_epi32(z1, 0x00);
z1 = _mm_shuffle_epi32(z1, 0x55);
z2 = _mm_loadu_si128((const __m128i *) (x + 8));
z7 = _mm_shuffle_epi32(z2, 0xff);
z13 = _mm_shuffle_epi32(z2, 0x55);
z2 = _mm_shuffle_epi32(z2, 0xaa);
/* no z8 -> first half of the nonce, will fill later */
z3 = _mm_loadu_si128((const __m128i *) (x + 12));
z4 = _mm_shuffle_epi32(z3, 0x00);
z14 = _mm_shuffle_epi32(z3, 0xaa);
z3 = _mm_shuffle_epi32(z3, 0xff);
/* no z9 -> second half of the nonce, will fill later */
orig0 = z0;
orig1 = z1;
orig2 = z2;
orig3 = z3;
orig4 = z4;
orig5 = z5;
orig6 = z6;
orig7 = z7;
orig10 = z10;
orig11 = z11;
orig12 = z12;
orig13 = z13;
orig14 = z14;
orig15 = z15;
while (bytes >= 256) {
/* vector implementation for z8 and z9 */
/* not sure if it helps for only 4 blocks */
const __m128i addv8 = _mm_set_epi64x(1, 0);
const __m128i addv9 = _mm_set_epi64x(3, 2);
__m128i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13];
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
t8 = _mm_set1_epi64x(in89);
t9 = _mm_set1_epi64x(in89);
z8 = _mm_add_epi64(addv8, t8);
z9 = _mm_add_epi64(addv9, t9);
t8 = _mm_unpacklo_epi32(z8, z9);
t9 = _mm_unpackhi_epi32(z8, z9);
z8 = _mm_unpacklo_epi32(t8, t9);
z9 = _mm_unpackhi_epi32(t8, t9);
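/* after the two unpack passes z8 holds the four per-block counter low
   words and z9 the four high words, one per block lane */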
orig8 = z8;
orig9 = z9;
in89 += 4;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm_add_epi32(y4, z0);
r4 = y4;
y4 = _mm_slli_epi32(y4, 7);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 25);
z4 = _mm_xor_si128(z4, r4);
y9 = z1;
y9 = _mm_add_epi32(y9, z5);
r9 = y9;
y9 = _mm_slli_epi32(y9, 7);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 25);
z9 = _mm_xor_si128(z9, r9);
y8 = z0;
y8 = _mm_add_epi32(y8, z4);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z5;
y13 = _mm_add_epi32(y13, z9);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y12 = z4;
y12 = _mm_add_epi32(y12, z8);
r12 = y12;
y12 = _mm_slli_epi32(y12, 13);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 19);
z12 = _mm_xor_si128(z12, r12);
y1 = z9;
y1 = _mm_add_epi32(y1, z13);
r1 = y1;
y1 = _mm_slli_epi32(y1, 13);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 19);
z1 = _mm_xor_si128(z1, r1);
y0 = z8;
y0 = _mm_add_epi32(y0, z12);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z13;
y5 = _mm_add_epi32(y5, z1);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y14 = z6;
y14 = _mm_add_epi32(y14, z10);
r14 = y14;
y14 = _mm_slli_epi32(y14, 7);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 25);
z14 = _mm_xor_si128(z14, r14);
y3 = z11;
y3 = _mm_add_epi32(y3, z15);
r3 = y3;
y3 = _mm_slli_epi32(y3, 7);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 25);
z3 = _mm_xor_si128(z3, r3);
y2 = z10;
y2 = _mm_add_epi32(y2, z14);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z15;
y7 = _mm_add_epi32(y7, z3);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y6 = z14;
y6 = _mm_add_epi32(y6, z2);
r6 = y6;
y6 = _mm_slli_epi32(y6, 13);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 19);
z6 = _mm_xor_si128(z6, r6);
y11 = z3;
y11 = _mm_add_epi32(y11, z7);
r11 = y11;
y11 = _mm_slli_epi32(y11, 13);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 19);
z11 = _mm_xor_si128(z11, r11);
y10 = z2;
y10 = _mm_add_epi32(y10, z6);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y1 = z3;
y1 = _mm_add_epi32(y1, z0);
r1 = y1;
y1 = _mm_slli_epi32(y1, 7);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 25);
z1 = _mm_xor_si128(z1, r1);
y15 = z7;
y15 = _mm_add_epi32(y15, z11);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
y6 = z4;
y6 = _mm_add_epi32(y6, z5);
r6 = y6;
y6 = _mm_slli_epi32(y6, 7);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 25);
z6 = _mm_xor_si128(z6, r6);
y2 = z0;
y2 = _mm_add_epi32(y2, z1);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z5;
y7 = _mm_add_epi32(y7, z6);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y3 = z1;
y3 = _mm_add_epi32(y3, z2);
r3 = y3;
y3 = _mm_slli_epi32(y3, 13);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 19);
z3 = _mm_xor_si128(z3, r3);
y4 = z6;
y4 = _mm_add_epi32(y4, z7);
r4 = y4;
y4 = _mm_slli_epi32(y4, 13);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 19);
z4 = _mm_xor_si128(z4, r4);
y0 = z2;
y0 = _mm_add_epi32(y0, z3);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z7;
y5 = _mm_add_epi32(y5, z4);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y11 = z9;
y11 = _mm_add_epi32(y11, z10);
r11 = y11;
y11 = _mm_slli_epi32(y11, 7);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 25);
z11 = _mm_xor_si128(z11, r11);
y12 = z14;
y12 = _mm_add_epi32(y12, z15);
r12 = y12;
y12 = _mm_slli_epi32(y12, 7);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 25);
z12 = _mm_xor_si128(z12, r12);
y8 = z10;
y8 = _mm_add_epi32(y8, z11);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z15;
y13 = _mm_add_epi32(y13, z12);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y9 = z11;
y9 = _mm_add_epi32(y9, z8);
r9 = y9;
y9 = _mm_slli_epi32(y9, 13);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 19);
z9 = _mm_xor_si128(z9, r9);
y14 = z12;
y14 = _mm_add_epi32(y14, z13);
r14 = y14;
y14 = _mm_slli_epi32(y14, 13);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 19);
z14 = _mm_xor_si128(z14, r14);
y10 = z8;
y10 = _mm_add_epi32(y10, z9);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y15 = z13;
y15 = _mm_add_epi32(y15, z14);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
}
/* store data ; this macro replicates the original amd64-xmm6 code */
#define ONEQUAD_SHUFFLE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
\
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data; this macro replaces shuffle+mov with a direct extract; not much
* difference */
#define ONEQUAD_EXTRACT(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 1); \
in##B = _mm_extract_epi32(z##B, 1); \
in##C = _mm_extract_epi32(z##C, 1); \
in##D = _mm_extract_epi32(z##D, 1); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 2); \
in##B = _mm_extract_epi32(z##B, 2); \
in##C = _mm_extract_epi32(z##C, 2); \
in##D = _mm_extract_epi32(z##D, 2); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 3); \
in##B = _mm_extract_epi32(z##B, 3); \
in##C = _mm_extract_epi32(z##C, 3); \
in##D = _mm_extract_epi32(z##D, 3); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data; this macro first transposes the data in registers, and then
* stores it to memory. much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
y##A = _mm_unpacklo_epi32(z##A, z##B); \
y##B = _mm_unpacklo_epi32(z##C, z##D); \
y##C = _mm_unpackhi_epi32(z##A, z##B); \
y##D = _mm_unpackhi_epi32(z##C, z##D); \
z##A = _mm_unpacklo_epi64(y##A, y##B); \
z##B = _mm_unpackhi_epi64(y##A, y##B); \
z##C = _mm_unpacklo_epi64(y##C, y##D); \
z##D = _mm_unpackhi_epi64(y##C, y##D); \
y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \
_mm_storeu_si128((__m128i *) (c + 0), y##A); \
y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \
_mm_storeu_si128((__m128i *) (c + 64), y##B); \
y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
_mm_storeu_si128((__m128i *) (c + 128), y##C); \
y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
_mm_storeu_si128((__m128i *) (c + 192), y##D)
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE
bytes -= 256;
c += 256;
m += 256;
}
}


@@ -1,477 +0,0 @@
if (bytes >= 512) {
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
/* the naive way seems as fast as (if not a bit faster than) the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while (bytes >= 512) {
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
/* store data; this macro first transposes the data in registers, and then
* stores it to memory. much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((const __m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((const __m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((const __m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((const __m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((const __m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((const __m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((const __m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((const __m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, \
_mm256_loadu_si256((const __m256i*) (m + 0))); \
y##B = _mm256_xor_si256( \
y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \
y##C = _mm256_xor_si256( \
y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \
y##D = _mm256_xor_si256( \
y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \
y##A2 = _mm256_xor_si256( \
y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
y##B2 = _mm256_xor_si256( \
y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
y##C2 = _mm256_xor_si256( \
y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
y##D2 = _mm256_xor_si256( \
y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), y##A); \
_mm256_storeu_si256((__m256i*) (c + 64), y##B); \
_mm256_storeu_si256((__m256i*) (c + 128), y##C); \
_mm256_storeu_si256((__m256i*) (c + 192), y##D); \
_mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
}


@@ -35,11 +35,6 @@
#endif
#ifdef XMRIG_ALGO_ASTROBWT
# include "crypto/astrobwt/AstroBWT.h"
#endif
#define ADD_FN(algo) do { \
m_map[algo] = new cn_hash_fun_array{}; \
m_map[algo]->data[AV_SINGLE][Assembly::NONE] = cryptonight_single_hash<algo, false, 0>; \
@@ -375,12 +370,6 @@ xmrig::CnHash::CnHash()
m_map[Algorithm::AR2_WRKZ]->data[AV_SINGLE_SOFT][Assembly::NONE] = argon2::single_hash<Algorithm::AR2_WRKZ>;
# endif
# ifdef XMRIG_ALGO_ASTROBWT
m_map[Algorithm::ASTROBWT_DERO_2] = new cn_hash_fun_array{};
m_map[Algorithm::ASTROBWT_DERO_2]->data[AV_SINGLE][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO_2>;
m_map[Algorithm::ASTROBWT_DERO_2]->data[AV_SINGLE_SOFT][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO_2>;
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
ADD_FN(Algorithm::CN_GR_0);
ADD_FN(Algorithm::CN_GR_1);


@@ -432,23 +432,6 @@ const static uint8_t argon2_wrkz_test_out[256] = {
#endif
#ifdef XMRIG_ALGO_ASTROBWT
// "astrobwt/v2"
const static uint8_t astrobwt_dero_2_test_out[256] = {
0x48, 0x9E, 0xD2, 0x66, 0x14, 0x27, 0x98, 0x65, 0x03, 0xFB, 0x87, 0x25, 0xE1, 0xD3, 0x98, 0xDA,
0x27, 0xEE, 0x25, 0x3D, 0xB4, 0x37, 0x87, 0x98, 0xBF, 0x5A, 0x5C, 0x94, 0xEE, 0x0C, 0xE2, 0x2A,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif
#ifdef XMRIG_ALGO_GHOSTRIDER
// "GhostRider"
const static uint8_t test_output_gr[256] = {


@@ -43,22 +43,22 @@ set(SOURCES
)
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os")


@@ -28,13 +28,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
void randomx_set_huge_pages_jit(bool)
{
}
void randomx_set_optimized_dataset_init(int)
{
}