Merge branch 'dev' into sync-base

commit 3ded8e6734

119 changed files with 5867 additions and 14547 deletions
@@ -1,139 +0,0 @@
/* XMRig
 * Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright (c) 2018-2019 tevador <tevador@gmail.com>
 * Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
 * Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
 * Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
 * Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "crypto/astrobwt/AstroBWT.h"
#include "backend/cpu/Cpu.h"
#include "base/crypto/sha3.h"
#include "base/tools/bswap_64.h"
#include "crypto/cn/CryptoNight.h"
#include "crypto/astrobwt/sort_indices2.h"

#include <limits>

static bool astrobwtInitialized = false;

#ifdef ASTROBWT_AVX2
static bool hasAVX2 = false;

extern "C"
#ifndef _MSC_VER
__attribute__((ms_abi))
#endif
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
#endif

#ifdef XMRIG_ARM
extern "C" {
#include "salsa20_ref/ecrypt-sync.h"
}

static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
    uint8_t iv[8] = {};
    ECRYPT_ctx ctx;
    ECRYPT_keysetup(&ctx, static_cast<const uint8_t*>(key), 256, 64);
    ECRYPT_ivsetup(&ctx, iv);
    ECRYPT_keystream_bytes(&ctx, static_cast<uint8_t*>(output), size);
    memset(static_cast<uint8_t*>(output) - 16, 0, 16);
    memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#else
#include "Salsa20.hpp"

static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
    const uint64_t iv = 0;
    ZeroTier::Salsa20 s(key, &iv);
    s.XORKeyStream(output, static_cast<uint32_t>(size));
    memset(static_cast<uint8_t*>(output) - 16, 0, 16);
    memset(static_cast<uint8_t*>(output) + size, 0, 16);
}

extern "C" int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key);

static void Salsa20_XORKeyStream_AVX256(const void* key, void* output, size_t size)
{
    const uint64_t iv = 0;
    salsa20_stream_avx2(output, size, &iv, key);
    memset(static_cast<uint8_t*>(output) - 16, 0, 16);
    memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#endif

bool xmrig::astrobwt::astrobwt_dero_v2(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash)
{
    constexpr size_t N = 9973;
    constexpr size_t STRIDE = 10240;

    alignas(8) uint8_t key[32];
    uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
    uint8_t* v = scratchpad_ptr;
    uint32_t* indices = (uint32_t*)(scratchpad_ptr + STRIDE);
    uint32_t* tmp_indices = (uint32_t*)(scratchpad_ptr + STRIDE * 5);

#ifdef ASTROBWT_AVX2
    if (hasAVX2) {
        SHA3_256_AVX2_ASM(input_data, input_size, key);
        Salsa20_XORKeyStream_AVX256(key, v, N);
    }
    else
#endif
    {
        sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
        Salsa20_XORKeyStream(key, v, N);
    }

    sort_indices_astrobwt_v2(N, v, indices, tmp_indices);

#ifdef ASTROBWT_AVX2
    if (hasAVX2) {
        SHA3_256_AVX2_ASM(indices, N * 2, output_hash);
    }
    else
#endif
    {
        sha3_HashBuffer(256, SHA3_FLAGS_NONE, indices, N * 2, output_hash, 32);
    }

    return true;
}

void xmrig::astrobwt::init()
{
    if (!astrobwtInitialized) {
#       ifdef ASTROBWT_AVX2
        hasAVX2 = Cpu::info()->hasAVX2();
#       endif

        astrobwtInitialized = true;
    }
}

template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO_2>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
    astrobwt_dero_v2(input, static_cast<uint32_t>(size), ctx[0]->memory, output);
}

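Note on the deleted algorithm: astrobwt_dero_v2() above composes four steps. SHA3-256 of the input yields a 256-bit key; a Salsa20 keystream of N = 9973 bytes is generated under that key (with 16 guard bytes zeroed on each side); the keystream's suffix indices are sorted by sort_indices_astrobwt_v2() (the BWT step); and the first 2*N bytes of the index array are hashed with SHA3-256. A minimal sketch of that pipeline, with hypothetical sha3_256() and salsa20_keystream() helpers standing in for the AVX2/scalar dispatch above (the sort signature is assumed from its call site):

#include <cstddef>
#include <cstdint>

// Hypothetical stand-ins for sha3_HashBuffer()/SHA3_256_AVX2_ASM and
// Salsa20_XORKeyStream(); signatures assumed for illustration only.
void sha3_256(const void* in, size_t len, uint8_t out[32]);
void salsa20_keystream(const uint8_t key[32], uint8_t* out, size_t len);
void sort_indices_astrobwt_v2(size_t n, const uint8_t* v, uint32_t* idx, uint32_t* tmp);

void astrobwt_dero_v2_sketch(const void* input, uint32_t input_size, uint8_t out[32])
{
    constexpr size_t N = 9973;
    static uint8_t  v[N + 32];        // keystream buffer, padded like the scratchpad
    static uint32_t indices[N + 1];
    static uint32_t tmp[N + 1];

    uint8_t key[32];
    sha3_256(input, input_size, key);                  // 1. key = SHA3-256(input)
    salsa20_keystream(key, v + 16, N);                 // 2. N-byte Salsa20 keystream
    sort_indices_astrobwt_v2(N, v + 16, indices, tmp); // 3. suffix-index (BWT) sort
    sha3_256(indices, N * 2, out);                     // 4. SHA3-256 of first 2*N index bytes
}
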
@@ -1,43 +0,0 @@
/* XMRig
 * Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright (c) 2018-2019 tevador <tevador@gmail.com>
 * Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
 * Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
 * Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
 * Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "base/crypto/Algorithm.h"

struct cryptonight_ctx;

namespace xmrig {

namespace astrobwt {

bool astrobwt_dero_v2(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash);
void init();

template<Algorithm::Id ALGO>
void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);

template<>
void single_hash<Algorithm::ASTROBWT_DERO_2>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);

}} // namespace xmrig::astrobwt

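Usage is two-phase: a one-time init() to probe CPU features, then per-hash calls through the single_hash specialization. A hedged call-site sketch (the cryptonight_ctx array is assumed to be set up by the CPU backend, with ctx[0]->memory large enough for the scratchpad offsets used in AstroBWT.cpp; the trailing uint64_t parameter is ignored by this specialization):

#include "crypto/astrobwt/AstroBWT.h"

void hash_one_sketch(const uint8_t* blob, size_t blob_size, uint8_t* hash, cryptonight_ctx** ctx)
{
    xmrig::astrobwt::init(); // idempotent AVX2 capability probe
    xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO_2>(blob, blob_size, hash, ctx, 0);
}
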
@@ -1,352 +0,0 @@
/*
 * Based on public domain code available at: http://cr.yp.to/snuffle.html
 *
 * Modifications and C-native SSE macro based SSE implementation by
 * Adam Ierymenko <adam.ierymenko@zerotier.com>.
 *
 * Additional modifications and code cleanup for AstroBWT by
 * SChernykh <https://github.com/SChernykh>
 *
 * Since the original was public domain, this is too.
 */

#include "Salsa20.hpp"

// Statically compute and define SSE constants
class _s20sseconsts
{
public:
    _s20sseconsts()
    {
        maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
        maskHi32 = _mm_slli_epi64(maskLo32, 32);
    }
    __m128i maskLo32, maskHi32;
};
static const _s20sseconsts _S20SSECONSTANTS;

namespace ZeroTier {

void Salsa20::init(const void *key, const void *iv)
{
    const uint32_t *const k = (const uint32_t *)key;
    _state.i[0]  = 0x61707865;
    _state.i[1]  = 0x3320646e;
    _state.i[2]  = 0x79622d32;
    _state.i[3]  = 0x6b206574;
    _state.i[4]  = k[3];
    _state.i[5]  = 0;
    _state.i[6]  = k[7];
    _state.i[7]  = k[2];
    _state.i[8]  = 0;
    _state.i[9]  = k[6];
    _state.i[10] = k[1];
    _state.i[11] = ((const uint32_t *)iv)[1];
    _state.i[12] = k[5];
    _state.i[13] = k[0];
    _state.i[14] = ((const uint32_t *)iv)[0];
    _state.i[15] = k[4];
}

void Salsa20::XORKeyStream(void *out, unsigned int bytes)
{
    uint8_t tmp[64];
    uint8_t *c = (uint8_t *)out;
    uint8_t *ctarget = c;
    unsigned int i;

    if (!bytes)
        return;

    for (;;) {
        if (bytes < 64) {
            for (i = 0; i < bytes; ++i)
                tmp[i] = 0;
            ctarget = c;
            c = tmp;
        }

        __m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
        __m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
        __m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
        __m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
        __m128i T;
        __m128i X0s = X0;
        __m128i X1s = X1;
        __m128i X2s = X2;
        __m128i X3s = X3;

        // 10 double rounds (Salsa20/20). The original source fully unrolled
        // this loop as ten identical "2X round" blocks; the loop body below
        // is byte-for-byte the same as each unrolled block.
        for (int round = 0; round < 10; ++round) {
            T = _mm_add_epi32(X0, X3);
            X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
            T = _mm_add_epi32(X1, X0);
            X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
            T = _mm_add_epi32(X2, X1);
            X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
            T = _mm_add_epi32(X3, X2);
            X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
            X1 = _mm_shuffle_epi32(X1, 0x93);
            X2 = _mm_shuffle_epi32(X2, 0x4E);
            X3 = _mm_shuffle_epi32(X3, 0x39);
            T = _mm_add_epi32(X0, X1);
            X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
            T = _mm_add_epi32(X3, X0);
            X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
            T = _mm_add_epi32(X2, X3);
            X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
            T = _mm_add_epi32(X1, X2);
            X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
            X1 = _mm_shuffle_epi32(X1, 0x39);
            X2 = _mm_shuffle_epi32(X2, 0x4E);
            X3 = _mm_shuffle_epi32(X3, 0x93);
        }

        X0 = _mm_add_epi32(X0s, X0);
        X1 = _mm_add_epi32(X1s, X1);
        X2 = _mm_add_epi32(X2s, X2);
        X3 = _mm_add_epi32(X3s, X3);

        __m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
        __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
        __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
        __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
        _mm_storeu_ps(reinterpret_cast<float *>(c),      _mm_castsi128_ps(_mm_unpackhi_epi64(k02, k20)));
        _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,  _mm_castsi128_ps(_mm_unpackhi_epi64(k13, k31)));
        _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,  _mm_castsi128_ps(_mm_unpacklo_epi64(k20, k02)));
        _mm_storeu_ps(reinterpret_cast<float *>(c) + 12, _mm_castsi128_ps(_mm_unpacklo_epi64(k31, k13)));

        if (!(++_state.i[8])) {
            ++_state.i[5]; // state reordered for SSE
            /* stopping at 2^70 bytes per nonce is user's responsibility */
        }

        if (bytes <= 64) {
            if (bytes < 64) {
                for (i = 0; i < bytes; ++i)
                    ctarget[i] = c[i];
            }

            return;
        }

        bytes -= 64;
        c += 64;
    }
}

} // namespace ZeroTier

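Each iteration of the double-round loop above performs Salsa20's column round and then its row round across four SSE registers: rotations are synthesized from paired shifts because SSE2 has no 32-bit rotate, and the _mm_shuffle_epi32 calls re-align the lanes between the two half-rounds so that identical code covers both. A scalar sketch of the lane-wise pattern:

#include <cstdint>

// Scalar equivalent of one vector statement above: ROTL32(t, 7) built from
// (t << 7) and (t >> 25), exactly as the slli/srli/xor triples do per lane.
static inline uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

// One quarter-round as applied lane-wise to (X0, X1, X2, X3):
static inline void quarter(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
{
    b ^= rotl32(a + d, 7);
    c ^= rotl32(b + a, 9);
    d ^= rotl32(c + b, 13);
    a ^= rotl32(d + c, 18);
}
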
@@ -1,52 +0,0 @@
/*
 * Based on public domain code available at: http://cr.yp.to/snuffle.html
 *
 * This therefore is public domain.
 */

#ifndef ZT_SALSA20_HPP
#define ZT_SALSA20_HPP

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>

namespace ZeroTier {

/**
 * Salsa20 stream cipher
 */
class Salsa20
{
public:
    /**
     * @param key 256-bit (32 byte) key
     * @param iv 64-bit initialization vector
     */
    Salsa20(const void *key, const void *iv)
    {
        init(key, iv);
    }

    /**
     * Initialize cipher
     *
     * @param key Key bits
     * @param iv 64-bit initialization vector
     */
    void init(const void *key, const void *iv);

    void XORKeyStream(void *out, unsigned int bytes);

private:
    union {
        __m128i v[4];
        uint32_t i[16];
    } _state;
};

} // namespace ZeroTier

#endif

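A hedged usage sketch for this class: XORKeyStream() XORs keystream into the buffer, so a zero-filled buffer comes back holding raw keystream, which is exactly how Salsa20_XORKeyStream() in AstroBWT.cpp drives it:

#include <cstdint>
#include <cstring>
#include "Salsa20.hpp"

int main()
{
    uint8_t key[32] = {};                    // 256-bit key (all zero for the sketch)
    const uint64_t iv = 0;                   // 64-bit IV
    uint8_t stream[128];
    std::memset(stream, 0, sizeof(stream));  // zero in => pure keystream out
    ZeroTier::Salsa20 s(key, &iv);
    s.XORKeyStream(stream, sizeof(stream));
    return 0;
}
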
@@ -1,272 +0,0 @@
/* ecrypt-config.h */

/* *** Normally, it should not be necessary to edit this file. *** */

#ifndef ECRYPT_CONFIG
#define ECRYPT_CONFIG

/* ------------------------------------------------------------------------- */

/* Guess the endianness of the target architecture. */

/*
 * The LITTLE endian machines:
 */
#if defined(__ultrix)           /* Older MIPS */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__alpha)          /* Alpha */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(i386)             /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__i386)           /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_M_IX86)          /* x86 (MSC, Borland) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_MSC_VER)         /* x86 (surely MSC) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__INTEL_COMPILER) /* x86 (surely Intel compiler icl.exe) */
#define ECRYPT_LITTLE_ENDIAN

/*
 * The BIG endian machines:
 */
#elif defined(sun)              /* Newer Sparc's */
#define ECRYPT_BIG_ENDIAN
#elif defined(__ppc__)          /* PowerPC */
#define ECRYPT_BIG_ENDIAN

/*
 * Finally machines with UNKNOWN endianness:
 */
#elif defined (_AIX)            /* RS6000 */
#define ECRYPT_UNKNOWN
#elif defined(__hpux)           /* HP-PA */
#define ECRYPT_UNKNOWN
#elif defined(__aux)            /* 68K */
#define ECRYPT_UNKNOWN
#elif defined(__dgux)           /* 88K (but P6 in latest boxes) */
#define ECRYPT_UNKNOWN
#elif defined(__sgi)            /* Newer MIPS */
#define ECRYPT_UNKNOWN
#else                           /* Any other processor */
#define ECRYPT_UNKNOWN
#endif

/* ------------------------------------------------------------------------- */

/*
 * Find minimal-width types to store 8-bit, 16-bit, 32-bit, and 64-bit
 * integers.
 *
 * Note: to enable 64-bit types on 32-bit compilers, it might be
 * necessary to switch from ISO C90 mode to ISO C99 mode (e.g., gcc
 * -std=c99).
 */

#include <limits.h>

/* --- check char --- */

#if (UCHAR_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T char
#define U8C(v) (v##U)

#if (UCHAR_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif

#endif

#if (UCHAR_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T char
#define U16C(v) (v##U)
#endif

#if (UCHAR_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T char
#define U32C(v) (v##U)
#endif

#if (UCHAR_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T char
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif

#endif
#endif
#endif
#endif

/* --- check short --- */

#if (USHRT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T short
#define U8C(v) (v##U)

#if (USHRT_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif

#endif

#if (USHRT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T short
#define U16C(v) (v##U)
#endif

#if (USHRT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T short
#define U32C(v) (v##U)
#endif

#if (USHRT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T short
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif

#endif
#endif
#endif
#endif

/* --- check int --- */

#if (UINT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T int
#define U8C(v) (v##U)

#if (ULONG_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif

#endif

#if (UINT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T int
#define U16C(v) (v##U)
#endif

#if (UINT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T int
#define U32C(v) (v##U)
#endif

#if (UINT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T int
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif

#endif
#endif
#endif
#endif

/* --- check long --- */

#if (ULONG_MAX / 0xFUL > 0xFUL)
#ifndef I8T
#define I8T long
#define U8C(v) (v##UL)

#if (ULONG_MAX == 0xFFUL)
#define ECRYPT_I8T_IS_BYTE
#endif

#endif

#if (ULONG_MAX / 0xFFUL > 0xFFUL)
#ifndef I16T
#define I16T long
#define U16C(v) (v##UL)
#endif

#if (ULONG_MAX / 0xFFFFUL > 0xFFFFUL)
#ifndef I32T
#define I32T long
#define U32C(v) (v##UL)
#endif

#if (ULONG_MAX / 0xFFFFFFFFUL > 0xFFFFFFFFUL)
#ifndef I64T
#define I64T long
#define U64C(v) (v##UL)
#define ECRYPT_NATIVE64
#endif

#endif
#endif
#endif
#endif

/* --- check long long --- */

#ifdef ULLONG_MAX

#if (ULLONG_MAX / 0xFULL > 0xFULL)
#ifndef I8T
#define I8T long long
#define U8C(v) (v##ULL)

#if (ULLONG_MAX == 0xFFULL)
#define ECRYPT_I8T_IS_BYTE
#endif

#endif

#if (ULLONG_MAX / 0xFFULL > 0xFFULL)
#ifndef I16T
#define I16T long long
#define U16C(v) (v##ULL)
#endif

#if (ULLONG_MAX / 0xFFFFULL > 0xFFFFULL)
#ifndef I32T
#define I32T long long
#define U32C(v) (v##ULL)
#endif

#if (ULLONG_MAX / 0xFFFFFFFFULL > 0xFFFFFFFFULL)
#ifndef I64T
#define I64T long long
#define U64C(v) (v##ULL)
#endif

#endif
#endif
#endif
#endif

#endif

/* --- check __int64 --- */

#ifdef _UI64_MAX

#if (_UI64_MAX / 0xFFFFFFFFui64 > 0xFFFFFFFFui64)
#ifndef I64T
#define I64T __int64
#define U64C(v) (v##ui64)
#endif

#endif

#endif

/* ------------------------------------------------------------------------- */

#endif

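The width guards above deliberately use division rather than large constants: for an all-ones maximum M = 2^k - 1, an expression such as (M / 0xFFU > 0xFFU) is true exactly when k >= 16, so each test probes for a wider type without ever writing a constant that might not fit in the type being probed. A runtime sketch of the same checks:

#include <climits>
#include <cstdio>

// Mirrors the preprocessor guards above at runtime (illustration only).
int main()
{
    std::printf("char  has >= 16 value bits: %d\n", UCHAR_MAX / 0xFFU > 0xFFU);
    std::printf("short has >= 32 value bits: %d\n", USHRT_MAX / 0xFFFFU > 0xFFFFU);
    std::printf("long  has >= 64 value bits: %d\n", ULONG_MAX / 0xFFFFFFFFUL > 0xFFFFFFFFUL);
    return 0;
}
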
@@ -1,46 +0,0 @@
/* ecrypt-machine.h */

/*
 * This file is included by 'ecrypt-portable.h'. It allows the default
 * macros to be overridden for specific platforms. Please carefully check
 * the machine code generated by your compiler (with optimisations
 * turned on) before deciding to edit this file.
 */

/* ------------------------------------------------------------------------- */

#if (defined(ECRYPT_DEFAULT_ROT) && !defined(ECRYPT_MACHINE_ROT))

#define ECRYPT_MACHINE_ROT

#if (defined(WIN32) && defined(_MSC_VER))

#undef ROTL32
#undef ROTR32
#undef ROTL64
#undef ROTR64

#include <stdlib.h>

#define ROTL32(v, n) _lrotl(v, n)
#define ROTR32(v, n) _lrotr(v, n)
#define ROTL64(v, n) _rotl64(v, n)
#define ROTR64(v, n) _rotr64(v, n)

#endif

#endif

/* ------------------------------------------------------------------------- */

#if (defined(ECRYPT_DEFAULT_SWAP) && !defined(ECRYPT_MACHINE_SWAP))

#define ECRYPT_MACHINE_SWAP

/*
 * If you want to overwrite the default swap macros, put them here. And so on.
 */

#endif

/* ------------------------------------------------------------------------- */

@@ -1,303 +0,0 @@
/* ecrypt-portable.h */

/*
 * WARNING: the conversions defined below are implemented as macros,
 * and should be used carefully. They should NOT be used with
 * parameters which perform some action. E.g., the following two lines
 * are not equivalent:
 *
 *  1) ++x; y = ROTL32(x, n);
 *  2) y = ROTL32(++x, n);
 */

/*
 * *** Please do not edit this file. ***
 *
 * The default macros can be overridden for specific architectures by
 * editing 'ecrypt-machine.h'.
 */

#ifndef ECRYPT_PORTABLE
#define ECRYPT_PORTABLE

#include "ecrypt-config.h"

/* ------------------------------------------------------------------------- */

/*
 * The following types are defined (if available):
 *
 *   u8:  unsigned integer type, at least 8 bits
 *   u16: unsigned integer type, at least 16 bits
 *   u32: unsigned integer type, at least 32 bits
 *   u64: unsigned integer type, at least 64 bits
 *
 *   s8, s16, s32, s64 -> signed counterparts of u8, u16, u32, u64
 *
 * The selection of minimum-width integer types is taken care of by
 * 'ecrypt-config.h'. Note: to enable 64-bit types on 32-bit
 * compilers, it might be necessary to switch from ISO C90 mode to ISO
 * C99 mode (e.g., gcc -std=c99).
 */

#ifdef I8T
typedef signed I8T s8;
typedef unsigned I8T u8;
#endif

#ifdef I16T
typedef signed I16T s16;
typedef unsigned I16T u16;
#endif

#ifdef I32T
typedef signed I32T s32;
typedef unsigned I32T u32;
#endif

#ifdef I64T
typedef signed I64T s64;
typedef unsigned I64T u64;
#endif

/*
 * The following macros are used to obtain exact-width results.
 */

#define U8V(v)  ((u8)(v)  & U8C(0xFF))
#define U16V(v) ((u16)(v) & U16C(0xFFFF))
#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))
#define U64V(v) ((u64)(v) & U64C(0xFFFFFFFFFFFFFFFF))

/* ------------------------------------------------------------------------- */

/*
 * The following macros return words with their bits rotated over n
 * positions to the left/right.
 */

#define ECRYPT_DEFAULT_ROT

#define ROTL8(v, n) \
  (U8V((v) << (n)) | ((v) >> (8 - (n))))

#define ROTL16(v, n) \
  (U16V((v) << (n)) | ((v) >> (16 - (n))))

#define ROTL32(v, n) \
  (U32V((v) << (n)) | ((v) >> (32 - (n))))

#define ROTL64(v, n) \
  (U64V((v) << (n)) | ((v) >> (64 - (n))))

#define ROTR8(v, n)  ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))

#include "ecrypt-machine.h"

/* ------------------------------------------------------------------------- */

/*
 * The following macros return a word with bytes in reverse order.
 */

#define ECRYPT_DEFAULT_SWAP

#define SWAP16(v) \
  ROTL16(v, 8)

#define SWAP32(v) \
  ((ROTL32(v, 8) & U32C(0x00FF00FF)) | \
   (ROTL32(v, 24) & U32C(0xFF00FF00)))

#ifdef ECRYPT_NATIVE64
#define SWAP64(v) \
  ((ROTL64(v, 8) & U64C(0x000000FF000000FF)) | \
   (ROTL64(v, 24) & U64C(0x0000FF000000FF00)) | \
   (ROTL64(v, 40) & U64C(0x00FF000000FF0000)) | \
   (ROTL64(v, 56) & U64C(0xFF000000FF000000)))
#else
#define SWAP64(v) \
  (((u64)SWAP32(U32V(v)) << 32) | (u64)SWAP32(U32V(v >> 32)))
#endif

#include "ecrypt-machine.h"

#define ECRYPT_DEFAULT_WTOW

#ifdef ECRYPT_LITTLE_ENDIAN
#define U16TO16_LITTLE(v) (v)
#define U32TO32_LITTLE(v) (v)
#define U64TO64_LITTLE(v) (v)

#define U16TO16_BIG(v) SWAP16(v)
#define U32TO32_BIG(v) SWAP32(v)
#define U64TO64_BIG(v) SWAP64(v)
#endif

#ifdef ECRYPT_BIG_ENDIAN
#define U16TO16_LITTLE(v) SWAP16(v)
#define U32TO32_LITTLE(v) SWAP32(v)
#define U64TO64_LITTLE(v) SWAP64(v)

#define U16TO16_BIG(v) (v)
#define U32TO32_BIG(v) (v)
#define U64TO64_BIG(v) (v)
#endif

#include "ecrypt-machine.h"

/*
 * The following macros load words from an array of bytes with
 * different types of endianness, and vice versa.
 */

#define ECRYPT_DEFAULT_BTOW

#if (!defined(ECRYPT_UNKNOWN) && defined(ECRYPT_I8T_IS_BYTE))

#define U8TO16_LITTLE(p) U16TO16_LITTLE(((u16*)(p))[0])
#define U8TO32_LITTLE(p) U32TO32_LITTLE(((u32*)(p))[0])
#define U8TO64_LITTLE(p) U64TO64_LITTLE(((u64*)(p))[0])

#define U8TO16_BIG(p) U16TO16_BIG(((u16*)(p))[0])
#define U8TO32_BIG(p) U32TO32_BIG(((u32*)(p))[0])
#define U8TO64_BIG(p) U64TO64_BIG(((u64*)(p))[0])

#define U16TO8_LITTLE(p, v) (((u16*)(p))[0] = U16TO16_LITTLE(v))
#define U32TO8_LITTLE(p, v) (((u32*)(p))[0] = U32TO32_LITTLE(v))
#define U64TO8_LITTLE(p, v) (((u64*)(p))[0] = U64TO64_LITTLE(v))

#define U16TO8_BIG(p, v) (((u16*)(p))[0] = U16TO16_BIG(v))
#define U32TO8_BIG(p, v) (((u32*)(p))[0] = U32TO32_BIG(v))
#define U64TO8_BIG(p, v) (((u64*)(p))[0] = U64TO64_BIG(v))

#else

#define U8TO16_LITTLE(p) \
  (((u16)((p)[0])      ) | \
   ((u16)((p)[1]) <<  8))

#define U8TO32_LITTLE(p) \
  (((u32)((p)[0])      ) | \
   ((u32)((p)[1]) <<  8) | \
   ((u32)((p)[2]) << 16) | \
   ((u32)((p)[3]) << 24))

#ifdef ECRYPT_NATIVE64
#define U8TO64_LITTLE(p) \
  (((u64)((p)[0])      ) | \
   ((u64)((p)[1]) <<  8) | \
   ((u64)((p)[2]) << 16) | \
   ((u64)((p)[3]) << 24) | \
   ((u64)((p)[4]) << 32) | \
   ((u64)((p)[5]) << 40) | \
   ((u64)((p)[6]) << 48) | \
   ((u64)((p)[7]) << 56))
#else
#define U8TO64_LITTLE(p) \
  ((u64)U8TO32_LITTLE(p) | ((u64)U8TO32_LITTLE((p) + 4) << 32))
#endif

#define U8TO16_BIG(p) \
  (((u16)((p)[0]) <<  8) | \
   ((u16)((p)[1])      ))

#define U8TO32_BIG(p) \
  (((u32)((p)[0]) << 24) | \
   ((u32)((p)[1]) << 16) | \
   ((u32)((p)[2]) <<  8) | \
   ((u32)((p)[3])      ))

#ifdef ECRYPT_NATIVE64
#define U8TO64_BIG(p) \
  (((u64)((p)[0]) << 56) | \
   ((u64)((p)[1]) << 48) | \
   ((u64)((p)[2]) << 40) | \
   ((u64)((p)[3]) << 32) | \
   ((u64)((p)[4]) << 24) | \
   ((u64)((p)[5]) << 16) | \
   ((u64)((p)[6]) <<  8) | \
   ((u64)((p)[7])      ))
#else
#define U8TO64_BIG(p) \
  (((u64)U8TO32_BIG(p) << 32) | (u64)U8TO32_BIG((p) + 4))
#endif

#define U16TO8_LITTLE(p, v) \
  do { \
    (p)[0] = U8V((v)      ); \
    (p)[1] = U8V((v) >>  8); \
  } while (0)

#define U32TO8_LITTLE(p, v) \
  do { \
    (p)[0] = U8V((v)      ); \
    (p)[1] = U8V((v) >>  8); \
    (p)[2] = U8V((v) >> 16); \
    (p)[3] = U8V((v) >> 24); \
  } while (0)

#ifdef ECRYPT_NATIVE64
#define U64TO8_LITTLE(p, v) \
  do { \
    (p)[0] = U8V((v)      ); \
    (p)[1] = U8V((v) >>  8); \
    (p)[2] = U8V((v) >> 16); \
    (p)[3] = U8V((v) >> 24); \
    (p)[4] = U8V((v) >> 32); \
    (p)[5] = U8V((v) >> 40); \
    (p)[6] = U8V((v) >> 48); \
    (p)[7] = U8V((v) >> 56); \
  } while (0)
#else
#define U64TO8_LITTLE(p, v) \
  do { \
    U32TO8_LITTLE((p),     U32V((v)      )); \
    U32TO8_LITTLE((p) + 4, U32V((v) >> 32)); \
  } while (0)
#endif

#define U16TO8_BIG(p, v) \
  do { \
    (p)[0] = U8V((v)      ); \
    (p)[1] = U8V((v) >>  8); \
  } while (0)

#define U32TO8_BIG(p, v) \
  do { \
    (p)[0] = U8V((v) >> 24); \
    (p)[1] = U8V((v) >> 16); \
    (p)[2] = U8V((v) >>  8); \
    (p)[3] = U8V((v)      ); \
  } while (0)

#ifdef ECRYPT_NATIVE64
#define U64TO8_BIG(p, v) \
  do { \
    (p)[0] = U8V((v) >> 56); \
    (p)[1] = U8V((v) >> 48); \
    (p)[2] = U8V((v) >> 40); \
    (p)[3] = U8V((v) >> 32); \
    (p)[4] = U8V((v) >> 24); \
    (p)[5] = U8V((v) >> 16); \
    (p)[6] = U8V((v) >>  8); \
    (p)[7] = U8V((v)      ); \
  } while (0)
#else
#define U64TO8_BIG(p, v) \
  do { \
    U32TO8_BIG((p),     U32V((v) >> 32)); \
    U32TO8_BIG((p) + 4, U32V((v)      )); \
  } while (0)
#endif

#endif

#include "ecrypt-machine.h"

/* ------------------------------------------------------------------------- */

#endif

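The byte/word macros above come in two flavours: a cast-based fast path when endianness is known and chars are 8-bit, and a portable shift-based fallback. A self-contained sketch of the fallback's round-trip behaviour, written as functions:

#include <cassert>
#include <cstdint>

// Function versions of the generic (shift-based) U8TO32_LITTLE and
// U32TO8_LITTLE branches above; correct on any host byte order.
static uint32_t u8to32_little(const uint8_t* p)
{
    return (uint32_t)p[0]         | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static void u32to8_little(uint8_t* p, uint32_t v)
{
    p[0] = (uint8_t)(v);        p[1] = (uint8_t)(v >> 8);
    p[2] = (uint8_t)(v >> 16);  p[3] = (uint8_t)(v >> 24);
}

int main()
{
    uint8_t buf[4];
    u32to8_little(buf, 0x61707865U);           // "expa", Salsa20's first constant
    assert(buf[0] == 0x65 && buf[3] == 0x61);  // least significant byte first
    assert(u8to32_little(buf) == 0x61707865U); // round-trips exactly
    return 0;
}
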
@@ -1,279 +0,0 @@
/* ecrypt-sync.h */

/*
 * Header file for synchronous stream ciphers without authentication
 * mechanism.
 *
 * *** Please only edit parts marked with "[edit]". ***
 */

#ifndef ECRYPT_SYNC
#define ECRYPT_SYNC

#include "ecrypt-portable.h"

/* ------------------------------------------------------------------------- */

/* Cipher parameters */

/*
 * The name of your cipher.
 */
#define ECRYPT_NAME "Salsa20"    /* [edit] */
#define ECRYPT_PROFILE "S!_H."

/*
 * Specify which key and IV sizes are supported by your cipher. A user
 * should be able to enumerate the supported sizes by running the
 * following code:
 *
 *   for (i = 0; ECRYPT_KEYSIZE(i) <= ECRYPT_MAXKEYSIZE; ++i)
 *   {
 *     keysize = ECRYPT_KEYSIZE(i);
 *
 *     ...
 *   }
 *
 * All sizes are in bits.
 */

#define ECRYPT_MAXKEYSIZE 256             /* [edit] */
#define ECRYPT_KEYSIZE(i) (128 + (i)*128) /* [edit] */

#define ECRYPT_MAXIVSIZE 64               /* [edit] */
#define ECRYPT_IVSIZE(i) (64 + (i)*64)    /* [edit] */

/* ------------------------------------------------------------------------- */

/* Data structures */

/*
 * ECRYPT_ctx is the structure containing the representation of the
 * internal state of your cipher.
 */

typedef struct
{
  u32 input[16]; /* could be compressed */
  /*
   * [edit]
   *
   * Put here all state variables needed during the encryption process.
   */
} ECRYPT_ctx;

/* ------------------------------------------------------------------------- */

/* Mandatory functions */

/*
 * Key and message independent initialization. This function will be
 * called once when the program starts (e.g., to build expanded S-box
 * tables).
 */
void ECRYPT_init();

/*
 * Key setup. It is the user's responsibility to select the values of
 * keysize and ivsize from the set of supported values specified
 * above.
 */
void ECRYPT_keysetup(
  ECRYPT_ctx* ctx,
  const u8* key,
  u32 keysize,  /* Key size in bits. */
  u32 ivsize);  /* IV size in bits. */

/*
 * IV setup. After having called ECRYPT_keysetup(), the user is
 * allowed to call ECRYPT_ivsetup() several times in order to
 * encrypt/decrypt different messages with the same key but different
 * IV's.
 */
void ECRYPT_ivsetup(
  ECRYPT_ctx* ctx,
  const u8* iv);

/*
 * Encryption/decryption of arbitrary length messages.
 *
 * For efficiency reasons, the API provides two types of
 * encrypt/decrypt functions. The ECRYPT_encrypt_bytes() function
 * (declared here) encrypts byte strings of arbitrary length, while
 * the ECRYPT_encrypt_blocks() function (defined later) only accepts
 * lengths which are multiples of ECRYPT_BLOCKLENGTH.
 *
 * The user is allowed to make multiple calls to
 * ECRYPT_encrypt_blocks() to incrementally encrypt a long message,
 * but he is NOT allowed to make additional encryption calls once he
 * has called ECRYPT_encrypt_bytes() (unless he starts a new message
 * of course). For example, this sequence of calls is acceptable:
 *
 *   ECRYPT_keysetup();
 *
 *   ECRYPT_ivsetup();
 *   ECRYPT_encrypt_blocks();
 *   ECRYPT_encrypt_blocks();
 *   ECRYPT_encrypt_bytes();
 *
 *   ECRYPT_ivsetup();
 *   ECRYPT_encrypt_blocks();
 *   ECRYPT_encrypt_blocks();
 *
 *   ECRYPT_ivsetup();
 *   ECRYPT_encrypt_bytes();
 *
 * The following sequence is not:
 *
 *   ECRYPT_keysetup();
 *   ECRYPT_ivsetup();
 *   ECRYPT_encrypt_blocks();
 *   ECRYPT_encrypt_bytes();
 *   ECRYPT_encrypt_blocks();
 */

void ECRYPT_encrypt_bytes(
  ECRYPT_ctx* ctx,
  const u8* plaintext,
  u8* ciphertext,
  u32 msglen);  /* Message length in bytes. */

void ECRYPT_decrypt_bytes(
  ECRYPT_ctx* ctx,
  const u8* ciphertext,
  u8* plaintext,
  u32 msglen);  /* Message length in bytes. */

/* ------------------------------------------------------------------------- */

/* Optional features */

/*
 * For testing purposes it can sometimes be useful to have a function
 * which immediately generates keystream without having to provide it
 * with a zero plaintext. If your cipher cannot provide this function
 * (e.g., because it is not strictly a synchronous cipher), please
 * reset the ECRYPT_GENERATES_KEYSTREAM flag.
 */

#define ECRYPT_GENERATES_KEYSTREAM
#ifdef ECRYPT_GENERATES_KEYSTREAM

void ECRYPT_keystream_bytes(
  ECRYPT_ctx* ctx,
  u8* keystream,
  u32 length);  /* Length of keystream in bytes. */

#endif

/* ------------------------------------------------------------------------- */

/* Optional optimizations */

/*
 * By default, the functions in this section are implemented using
 * calls to functions declared above. However, you might want to
 * implement them differently for performance reasons.
 */

/*
 * All-in-one encryption/decryption of (short) packets.
 *
 * The default definitions of these functions can be found in
 * "ecrypt-sync.c". If you want to implement them differently, please
 * undef the ECRYPT_USES_DEFAULT_ALL_IN_ONE flag.
 */
#define ECRYPT_USES_DEFAULT_ALL_IN_ONE /* [edit] */

void ECRYPT_encrypt_packet(
  ECRYPT_ctx* ctx,
  const u8* iv,
  const u8* plaintext,
  u8* ciphertext,
  u32 msglen);

void ECRYPT_decrypt_packet(
  ECRYPT_ctx* ctx,
  const u8* iv,
  const u8* ciphertext,
  u8* plaintext,
  u32 msglen);

/*
 * Encryption/decryption of blocks.
 *
 * By default, these functions are defined as macros. If you want to
 * provide a different implementation, please undef the
 * ECRYPT_USES_DEFAULT_BLOCK_MACROS flag and implement the functions
 * declared below.
 */

#define ECRYPT_BLOCKLENGTH 64 /* [edit] */

#define ECRYPT_USES_DEFAULT_BLOCK_MACROS /* [edit] */
#ifdef ECRYPT_USES_DEFAULT_BLOCK_MACROS

#define ECRYPT_encrypt_blocks(ctx, plaintext, ciphertext, blocks) \
  ECRYPT_encrypt_bytes(ctx, plaintext, ciphertext, \
    (blocks) * ECRYPT_BLOCKLENGTH)

#define ECRYPT_decrypt_blocks(ctx, ciphertext, plaintext, blocks) \
  ECRYPT_decrypt_bytes(ctx, ciphertext, plaintext, \
    (blocks) * ECRYPT_BLOCKLENGTH)

#ifdef ECRYPT_GENERATES_KEYSTREAM

#define ECRYPT_keystream_blocks(ctx, keystream, blocks) \
  ECRYPT_keystream_bytes(ctx, keystream, \
    (blocks) * ECRYPT_BLOCKLENGTH)

#endif

#else

void ECRYPT_encrypt_blocks(
  ECRYPT_ctx* ctx,
  const u8* plaintext,
  u8* ciphertext,
  u32 blocks);  /* Message length in blocks. */

void ECRYPT_decrypt_blocks(
  ECRYPT_ctx* ctx,
  const u8* ciphertext,
  u8* plaintext,
  u32 blocks);  /* Message length in blocks. */

#ifdef ECRYPT_GENERATES_KEYSTREAM

void ECRYPT_keystream_blocks(
  ECRYPT_ctx* ctx,
  const u8* keystream,
  u32 blocks);  /* Keystream length in blocks. */

#endif

#endif

/*
 * If your cipher can be implemented in different ways, you can use
 * the ECRYPT_VARIANT parameter to allow the user to choose between
 * them at compile time (e.g., gcc -DECRYPT_VARIANT=3 ...). Please
 * only use this possibility if you really think it could make a
 * significant difference and keep the number of variants
 * (ECRYPT_MAXVARIANT) as small as possible (definitely not more than
 * 10). Note also that all variants should have exactly the same
 * external interface (i.e., the same ECRYPT_BLOCKLENGTH, etc.).
 */
#define ECRYPT_MAXVARIANT 1 /* [edit] */

#ifndef ECRYPT_VARIANT
#define ECRYPT_VARIANT 1
#endif

#if (ECRYPT_VARIANT > ECRYPT_MAXVARIANT)
#error this variant does not exist
#endif

/* ------------------------------------------------------------------------- */

#endif

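A hedged sketch of the calling sequence this header prescribes, mirroring the XMRIG_ARM path in AstroBWT.cpp: one keysetup per key, then ivsetup plus keystream generation per message (ECRYPT_init() is a no-op in the Salsa20 reference implementation below, so it is omitted here):

extern "C" {
#include "salsa20_ref/ecrypt-sync.h"
}
#include <cstdint>

void make_keystream_sketch(const uint8_t key[32], uint8_t* out, uint32_t len)
{
    ECRYPT_ctx ctx;
    uint8_t iv[8] = {};                      // 64-bit IV, all zero here
    ECRYPT_keysetup(&ctx, key, 256, 64);     // key and IV sizes given in bits
    ECRYPT_ivsetup(&ctx, iv);
    ECRYPT_keystream_bytes(&ctx, out, len);  // equivalent to encrypting zeroes
}
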
@ -1,219 +0,0 @@
|
|||
/*
|
||||
salsa20-merged.c version 20051118
|
||||
D. J. Bernstein
|
||||
Public domain.
|
||||
*/
|
||||
|
||||
#include "ecrypt-sync.h"
|
||||
|
||||
#define ROTATE(v,c) (ROTL32(v,c))
|
||||
#define XOR(v,w) ((v) ^ (w))
|
||||
#define PLUS(v,w) (U32V((v) + (w)))
|
||||
#define PLUSONE(v) (PLUS((v),1))
|
||||
|
||||
void ECRYPT_init(void)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static const char sigma[16] = "expand 32-byte k";
|
||||
static const char tau[16] = "expand 16-byte k";
|
||||
|
||||
void ECRYPT_keysetup(ECRYPT_ctx *x,const u8 *k,u32 kbits,u32 ivbits)
|
||||
{
|
||||
const char *constants;
|
||||
|
||||
x->input[1] = U8TO32_LITTLE(k + 0);
|
||||
x->input[2] = U8TO32_LITTLE(k + 4);
|
||||
x->input[3] = U8TO32_LITTLE(k + 8);
|
||||
x->input[4] = U8TO32_LITTLE(k + 12);
|
||||
if (kbits == 256) { /* recommended */
|
||||
k += 16;
|
||||
constants = sigma;
|
||||
} else { /* kbits == 128 */
|
||||
constants = tau;
|
||||
}
|
||||
x->input[11] = U8TO32_LITTLE(k + 0);
|
||||
x->input[12] = U8TO32_LITTLE(k + 4);
|
||||
x->input[13] = U8TO32_LITTLE(k + 8);
|
||||
x->input[14] = U8TO32_LITTLE(k + 12);
|
||||
x->input[0] = U8TO32_LITTLE(constants + 0);
|
||||
x->input[5] = U8TO32_LITTLE(constants + 4);
|
||||
x->input[10] = U8TO32_LITTLE(constants + 8);
|
||||
x->input[15] = U8TO32_LITTLE(constants + 12);
|
||||
}
|
||||
|
||||
void ECRYPT_ivsetup(ECRYPT_ctx *x,const u8 *iv)
|
||||
{
|
||||
x->input[6] = U8TO32_LITTLE(iv + 0);
|
||||
x->input[7] = U8TO32_LITTLE(iv + 4);
|
||||
x->input[8] = 0;
|
||||
x->input[9] = 0;
|
||||
}
|
||||
|
||||
void ECRYPT_encrypt_bytes(ECRYPT_ctx *x,const u8 *m,u8 *c,u32 bytes)
|
||||
{
|
||||
u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
|
||||
u8 *ctarget = 0;
|
||||
u8 tmp[64];
|
||||
int i;
|
||||
|
||||
if (!bytes) return;
|
||||
|
||||
j0 = x->input[0];
|
||||
j1 = x->input[1];
|
||||
j2 = x->input[2];
|
||||
j3 = x->input[3];
|
||||
  j4 = x->input[4];
  j5 = x->input[5];
  j6 = x->input[6];
  j7 = x->input[7];
  j8 = x->input[8];
  j9 = x->input[9];
  j10 = x->input[10];
  j11 = x->input[11];
  j12 = x->input[12];
  j13 = x->input[13];
  j14 = x->input[14];
  j15 = x->input[15];

  for (;;) {
    if (bytes < 64) {
      for (i = 0;i < bytes;++i) tmp[i] = m[i];
      m = tmp;
      ctarget = c;
      c = tmp;
    }
    x0 = j0;
    x1 = j1;
    x2 = j2;
    x3 = j3;
    x4 = j4;
    x5 = j5;
    x6 = j6;
    x7 = j7;
    x8 = j8;
    x9 = j9;
    x10 = j10;
    x11 = j11;
    x12 = j12;
    x13 = j13;
    x14 = j14;
    x15 = j15;
    for (i = 20;i > 0;i -= 2) {
      x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
      x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
      x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
      x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
      x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
      x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
      x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
      x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
      x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
      x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
      x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
      x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
      x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
      x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
      x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
      x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
      x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
      x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
      x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
      x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
      x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
      x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
      x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
      x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
      x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
      x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
      x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
      x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
      x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
      x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
      x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
      x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
    }
    x0 = PLUS(x0,j0);
    x1 = PLUS(x1,j1);
    x2 = PLUS(x2,j2);
    x3 = PLUS(x3,j3);
    x4 = PLUS(x4,j4);
    x5 = PLUS(x5,j5);
    x6 = PLUS(x6,j6);
    x7 = PLUS(x7,j7);
    x8 = PLUS(x8,j8);
    x9 = PLUS(x9,j9);
    x10 = PLUS(x10,j10);
    x11 = PLUS(x11,j11);
    x12 = PLUS(x12,j12);
    x13 = PLUS(x13,j13);
    x14 = PLUS(x14,j14);
    x15 = PLUS(x15,j15);

    x0 = XOR(x0,U8TO32_LITTLE(m + 0));
    x1 = XOR(x1,U8TO32_LITTLE(m + 4));
    x2 = XOR(x2,U8TO32_LITTLE(m + 8));
    x3 = XOR(x3,U8TO32_LITTLE(m + 12));
    x4 = XOR(x4,U8TO32_LITTLE(m + 16));
    x5 = XOR(x5,U8TO32_LITTLE(m + 20));
    x6 = XOR(x6,U8TO32_LITTLE(m + 24));
    x7 = XOR(x7,U8TO32_LITTLE(m + 28));
    x8 = XOR(x8,U8TO32_LITTLE(m + 32));
    x9 = XOR(x9,U8TO32_LITTLE(m + 36));
    x10 = XOR(x10,U8TO32_LITTLE(m + 40));
    x11 = XOR(x11,U8TO32_LITTLE(m + 44));
    x12 = XOR(x12,U8TO32_LITTLE(m + 48));
    x13 = XOR(x13,U8TO32_LITTLE(m + 52));
    x14 = XOR(x14,U8TO32_LITTLE(m + 56));
    x15 = XOR(x15,U8TO32_LITTLE(m + 60));

    j8 = PLUSONE(j8);
    if (!j8) {
      j9 = PLUSONE(j9);
      /* stopping at 2^70 bytes per nonce is user's responsibility */
    }

    U32TO8_LITTLE(c + 0,x0);
    U32TO8_LITTLE(c + 4,x1);
    U32TO8_LITTLE(c + 8,x2);
    U32TO8_LITTLE(c + 12,x3);
    U32TO8_LITTLE(c + 16,x4);
    U32TO8_LITTLE(c + 20,x5);
    U32TO8_LITTLE(c + 24,x6);
    U32TO8_LITTLE(c + 28,x7);
    U32TO8_LITTLE(c + 32,x8);
    U32TO8_LITTLE(c + 36,x9);
    U32TO8_LITTLE(c + 40,x10);
    U32TO8_LITTLE(c + 44,x11);
    U32TO8_LITTLE(c + 48,x12);
    U32TO8_LITTLE(c + 52,x13);
    U32TO8_LITTLE(c + 56,x14);
    U32TO8_LITTLE(c + 60,x15);

    if (bytes <= 64) {
      if (bytes < 64) {
        for (i = 0;i < bytes;++i) ctarget[i] = c[i];
      }
      x->input[8] = j8;
      x->input[9] = j9;
      return;
    }
    bytes -= 64;
    c += 64;
    m += 64;
  }
}


void ECRYPT_decrypt_bytes(ECRYPT_ctx *x,const u8 *c,u8 *m,u32 bytes)
{
  ECRYPT_encrypt_bytes(x,c,m,bytes);
}


void ECRYPT_keystream_bytes(ECRYPT_ctx *x,u8 *stream,u32 bytes)
{
  u32 i;
  for (i = 0; i < bytes; ++i) stream[i] = 0;
  ECRYPT_encrypt_bytes(x,stream,stream,bytes);
}
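For reference, a minimal sketch of driving this eSTREAM-style API (key size in bits, then IV size in bits, as in the keysetup signature above); the key bytes here are hypothetical placeholders:

#include <stdio.h>
#include <stdint.h>
#include "salsa20_ref/ecrypt-sync.h"

int main(void)
{
    static const uint8_t key[32] = { 1, 2, 3 };  /* hypothetical key material */
    static const uint8_t iv[8]   = { 0 };        /* zero nonce */
    uint8_t out[64];
    ECRYPT_ctx ctx;

    ECRYPT_keysetup(&ctx, key, 256, 64);         /* 256-bit key, 64-bit IV */
    ECRYPT_ivsetup(&ctx, iv);
    ECRYPT_keystream_bytes(&ctx, out, sizeof(out));  /* zero-fills, then encrypts in place */

    for (int i = 0; i < 16; ++i) printf("%02x", out[i]);
    printf("\n");
    return 0;
}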
@ -1,57 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#

.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif

#define ALIGN .balign
#define dq .quad

.global DECL(SHA3_256_AVX2_ASM)

ALIGN 64
DECL(SHA3_256_AVX2_ASM):

#include "sha3_256_avx2.inc"

KeccakF1600_AVX2_ASM:
lea r8,[rip+rot_left+96]
lea r9,[rip+rot_right+96]
lea r10,[rip+rndc]

#include "sha3_256_keccakf1600_avx2.inc"

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
@ -1,45 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#

_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE
PUBLIC SHA3_256_AVX2_ASM

ALIGN 64
SHA3_256_AVX2_ASM:

include sha3_256_avx2.inc

KeccakF1600_AVX2_ASM:
lea r8,[rot_left+96]
lea r9,[rot_right+96]
lea r10,[rndc]

include sha3_256_keccakf1600_avx2.inc

_SHA3_256_AVX2_ASM ENDS
END
@ -1,162 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#

vzeroupper

mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+16],rsi
mov qword ptr [rsp+24],rdi
push rbp
push r12
push r13
push r14
push r15

sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15

sub rsp,320
lea rbp,[rsp+64]
and rbp,-32
vpxor xmm0,xmm0,xmm0
xor edi,edi
mov dword ptr [rbp],50462976
mov r12,rdx
mov dword ptr [rbp+4],169150212
mov r14,rdx
mov dword ptr [rbp+8],218436623
shr r14,3
and r12d,7
mov dword ptr [rbp+12],135009046
mov r13,r8
mov byte ptr [rbp+16],9
mov rsi,rcx
mov ebx,edi
vmovdqa ymmword ptr [rbp+32],ymm0
vmovdqa ymmword ptr [rbp+64],ymm0
vmovdqa ymmword ptr [rbp+96],ymm0
vmovdqa ymmword ptr [rbp+128],ymm0
vmovdqa ymmword ptr [rbp+160],ymm0
vmovdqa ymmword ptr [rbp+192],ymm0
vmovdqa ymmword ptr [rbp+224],ymm0
test r14,r14
je sha3_main_loop_end

sha3_main_loop:
movzx eax,byte ptr [rbp+rbx]
lea rcx,[rbp+32]
lea rcx,[rcx+rax*8]
mov rax,qword ptr [rsi]
xor qword ptr [rcx],rax
lea r15,[rbx+1]
cmp rbx,16
jne skip_keccak

lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM

skip_keccak:
cmp rbx,16
mov rax,rdi
cmovne rax,r15
add rsi,8
mov rbx,rax
sub r14,1
jne sha3_main_loop

sha3_main_loop_end:
mov rdx,rdi
test r12,r12
je sha3_tail_loop_end
mov r8,rdi

sha3_tail_loop:
movzx eax,byte ptr [rdx+rsi]
inc rdx
shlx rcx,rax,r8
or rdi,rcx
add r8,8
cmp rdx,r12
jb sha3_tail_loop

sha3_tail_loop_end:
movzx eax,byte ptr [rbp+rbx]
lea rdx,[rbp+32]
lea rdx,[rdx+rax*8]
mov ecx,6
lea rax,[r12*8]
shlx rcx,rcx,rax
xor rcx,qword ptr [rdx]
mov eax,1
shl rax,63
xor rcx,rdi
mov qword ptr [rdx],rcx
xor qword ptr [rbp+104],rax

lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM

vmovups ymm0,ymmword ptr [rbp+32]
vmovups ymmword ptr [r13],ymm0
vzeroupper

add rsp,320

movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80

pop r15
pop r14
pop r13
pop r12
pop rbp
mov rbx,qword ptr [rsp+8]
mov rsi,qword ptr [rsp+16]
mov rdi,qword ptr [rsp+24]

ret
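The tail logic above (mov ecx,6 / shl rax,63) applies SHA-3 padding: the 0x06 domain byte after the last message byte and a final 0x80 bit at the end of the 136-byte rate (17 words, matching the cmp rbx,16 loop). A scalar sketch of that sponge framing; keccak_f1600 is a deliberate placeholder stub, and the permuted AVX2 lane layout used above is ignored:

#include <stdint.h>
#include <string.h>

#define RATE 136  /* SHA3-256 rate in bytes (1600-bit state, 512-bit capacity) */

/* Placeholder for the 24-round permutation (the role KeccakF1600_AVX2_ASM
   plays above); a real implementation goes here. */
static void keccak_f1600(uint64_t state[25]) { (void)state; /* omitted */ }

/* Sponge framing only: absorb full rate-sized blocks, then pad the tail. */
static void sha3_256_absorb(uint64_t state[25], const uint8_t *in, size_t len)
{
    memset(state, 0, 25 * sizeof(uint64_t));
    while (len >= RATE) {
        for (int i = 0; i < RATE / 8; ++i) {
            uint64_t lane;
            memcpy(&lane, in + 8 * i, 8);
            state[i] ^= lane;  /* XOR message lanes into the rate portion */
        }
        keccak_f1600(state);
        in  += RATE;
        len -= RATE;
    }
    uint8_t block[RATE] = { 0 };
    memcpy(block, in, len);
    block[len] = 0x06;        /* SHA-3 domain separation + first pad bit */
    block[RATE - 1] |= 0x80;  /* final pad bit */
    for (int i = 0; i < RATE / 8; ++i) {
        uint64_t lane;
        memcpy(&lane, block + 8 * i, 8);
        state[i] ^= lane;
    }
    keccak_f1600(state);      /* first 32 bytes of the state are the digest */
}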
@ -1,203 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#

mov eax,24
lea rcx,[rcx+96]
vpbroadcastq ymm0,QWORD PTR [rcx-96]
vmovdqu ymm1,YMMWORD PTR [rcx-88]
vmovdqu ymm2,YMMWORD PTR [rcx-56]
vmovdqu ymm3,YMMWORD PTR [rcx-24]
vmovdqu ymm4,YMMWORD PTR [rcx+8]
vmovdqu ymm5,YMMWORD PTR [rcx+40]
vmovdqu ymm6,YMMWORD PTR [rcx+72]

ALIGN 64
Loop_avx2:
vpshufd ymm13,ymm2,78
vpxor ymm12,ymm5,ymm3
vpxor ymm9,ymm4,ymm6
vpxor ymm12,ymm12,ymm1
vpxor ymm12,ymm12,ymm9
vpermq ymm11,ymm12,147
vpxor ymm13,ymm13,ymm2
vpermq ymm7,ymm13,78
vpsrlq ymm8,ymm12,63
vpaddq ymm9,ymm12,ymm12
vpor ymm8,ymm8,ymm9
vpermq ymm15,ymm8,57
vpxor ymm14,ymm8,ymm11
vpermq ymm14,ymm14,0
vpxor ymm13,ymm13,ymm0
vpxor ymm13,ymm13,ymm7
vpsrlq ymm7,ymm13,63
vpaddq ymm8,ymm13,ymm13
vpor ymm8,ymm8,ymm7
vpxor ymm2,ymm2,ymm14
vpxor ymm0,ymm0,ymm14
vpblendd ymm15,ymm15,ymm8,192
vpblendd ymm11,ymm11,ymm13,3
vpxor ymm15,ymm15,ymm11
vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96]
vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96]
vpor ymm2,ymm2,ymm10
vpxor ymm3,ymm3,ymm15
vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32]
vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32]
vpor ymm3,ymm3,ymm11
vpxor ymm4,ymm4,ymm15
vpsllvq ymm12,ymm4,YMMWORD PTR [r8]
vpsrlvq ymm4,ymm4,YMMWORD PTR [r9]
vpor ymm4,ymm4,ymm12
vpxor ymm5,ymm5,ymm15
vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32]
vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32]
vpor ymm5,ymm5,ymm13
vpxor ymm6,ymm6,ymm15
vpermq ymm10,ymm2,141
vpermq ymm11,ymm3,141
vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64]
vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64]
vpor ymm8,ymm8,ymm14
vpxor ymm1,ymm1,ymm15
vpermq ymm12,ymm4,27
vpermq ymm13,ymm5,114
vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64]
vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64]
vpor ymm9,ymm9,ymm15
vpsrldq ymm14,ymm8,8
vpandn ymm7,ymm8,ymm14
vpblendd ymm3,ymm9,ymm13,12
vpblendd ymm15,ymm11,ymm9,12
vpblendd ymm5,ymm10,ymm11,12
vpblendd ymm14,ymm9,ymm10,12
vpblendd ymm3,ymm3,ymm11,48
vpblendd ymm15,ymm15,ymm12,48
vpblendd ymm5,ymm5,ymm9,48
vpblendd ymm14,ymm14,ymm13,48
vpblendd ymm3,ymm3,ymm12,192
vpblendd ymm15,ymm15,ymm13,192
vpblendd ymm5,ymm5,ymm13,192
vpblendd ymm14,ymm14,ymm11,192
vpandn ymm3,ymm3,ymm15
vpandn ymm5,ymm5,ymm14
vpblendd ymm6,ymm12,ymm9,12
vpblendd ymm15,ymm10,ymm12,12
vpxor ymm3,ymm3,ymm10
vpblendd ymm6,ymm6,ymm10,48
vpblendd ymm15,ymm15,ymm11,48
vpxor ymm5,ymm5,ymm12
vpblendd ymm6,ymm6,ymm11,192
vpblendd ymm15,ymm15,ymm9,192
vpandn ymm6,ymm6,ymm15
vpxor ymm6,ymm6,ymm13
vpermq ymm4,ymm8,30
vpblendd ymm15,ymm4,ymm0,48
vpermq ymm1,ymm8,57
vpblendd ymm1,ymm1,ymm0,192
vpandn ymm1,ymm1,ymm15
vpblendd ymm2,ymm11,ymm12,12
vpblendd ymm14,ymm13,ymm11,12
vpblendd ymm2,ymm2,ymm13,48
vpblendd ymm14,ymm14,ymm10,48
vpblendd ymm2,ymm2,ymm10,192
vpblendd ymm14,ymm14,ymm12,192
vpandn ymm2,ymm2,ymm14
vpxor ymm2,ymm2,ymm9
vpermq ymm7,ymm7,0
vpermq ymm3,ymm3,27
vpermq ymm5,ymm5,141
vpermq ymm6,ymm6,114
vpblendd ymm4,ymm13,ymm10,12
vpblendd ymm14,ymm12,ymm13,12
vpblendd ymm4,ymm4,ymm12,48
vpblendd ymm14,ymm14,ymm9,48
vpblendd ymm4,ymm4,ymm9,192
vpblendd ymm14,ymm14,ymm10,192
vpandn ymm4,ymm4,ymm14
vpxor ymm0,ymm0,ymm7
vpxor ymm1,ymm1,ymm8
vpxor ymm4,ymm4,ymm11
vpxor ymm0,ymm0,YMMWORD PTR [r10]
lea r10,[r10+32]
dec eax
jnz Loop_avx2

vmovq QWORD PTR [rcx-96],xmm0
vmovdqu YMMWORD PTR [rcx-88],ymm1
vmovdqu YMMWORD PTR [rcx-56],ymm2
vmovdqu YMMWORD PTR [rcx-24],ymm3
vmovdqu YMMWORD PTR [rcx+8],ymm4
vmovdqu YMMWORD PTR [rcx+40],ymm5
vmovdqu YMMWORD PTR [rcx+72],ymm6

ret

ALIGN 32
rot_left:
dq 3, 18, 36, 41
dq 1, 62, 28, 27
dq 45, 6, 56, 39
dq 10, 61, 55, 8
dq 2, 15, 25, 20
dq 44, 43, 21, 14

ALIGN 32
rot_right:
dq 64-3, 64-18, 64-36, 64-41
dq 64-1, 64-62, 64-28, 64-27
dq 64-45, 64-6, 64-56, 64-39
dq 64-10, 64-61, 64-55, 64-8
dq 64-2, 64-15, 64-25, 64-20
dq 64-44, 64-43, 64-21, 64-14

ALIGN 32
rndc:
dq 1, 1, 1, 1
dq 32898, 32898, 32898, 32898
dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714
dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224
dq 32907, 32907, 32907, 32907
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585
dq 138, 138, 138, 138
dq 136, 136, 136, 136
dq 2147516425, 2147516425, 2147516425, 2147516425
dq 2147483658, 2147483658, 2147483658, 2147483658
dq 2147516555, 2147516555, 2147516555, 2147516555
dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947
dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713
dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579
dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578
dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936
dq 32778, 32778, 32778, 32778
dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232
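The rndc rows are the 24 Keccak-f[1600] round constants, each broadcast across the four lanes of a YMM register. A small sketch that regenerates them with the reference degree-8 LFSR, handy for checking the table:

#include <stdio.h>
#include <stdint.h>

/* Degree-8 LFSR from the Keccak reference (polynomial x^8+x^6+x^5+x^4+1);
   returns the current output bit and steps the register. */
static int lfsr86540(uint8_t *r)
{
    const int bit = *r & 1;
    *r = (*r & 0x80) ? (uint8_t)((*r << 1) ^ 0x71) : (uint8_t)(*r << 1);
    return bit;
}

int main(void)
{
    uint8_t r = 0x01;
    for (int round = 0; round < 24; ++round) {
        uint64_t rc = 0;
        for (int j = 0; j < 7; ++j) {            /* bits land at positions 2^j - 1 */
            if (lfsr86540(&r))
                rc ^= 1ULL << ((1 << j) - 1);
        }
        printf("%llu\n", (unsigned long long)rc);  /* matches the rndc rows above */
    }
    return 0;
}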
@ -1,208 +0,0 @@
/* XMRig
 * Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright (c) 2018-2019 tevador <tevador@gmail.com>
 * Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
 * Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
 * Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
 * Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "crypto/astrobwt/sort_indices2.h"
#include "base/tools/bswap_64.h"
#include <cstring>


#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#define RESTRICT __restrict__
#elif _MSC_VER
#define NOINLINE __declspec(noinline)
#define RESTRICT __restrict
#else
#define NOINLINE
#define RESTRICT
#endif


#if __has_cpp_attribute(unlikely)
#define UNLIKELY(X) (X) [[unlikely]]
#elif defined __GNUC__
#define UNLIKELY(X) (__builtin_expect((X), 0))
#else
#define UNLIKELY(X) (X)
#endif


static NOINLINE void fix(const uint8_t* RESTRICT v, uint32_t* RESTRICT indices, int32_t i)
{
    uint32_t prev_t = indices[i - 1];
    uint32_t t = indices[i];

    const uint32_t data_a = bswap_32(*(const uint32_t*)(v + (t & 0xFFFF) + 2));
    if (data_a < bswap_32(*(const uint32_t*)(v + (prev_t & 0xFFFF) + 2)))
    {
        const uint32_t t2 = prev_t;
        int32_t j = i - 1;
        do
        {
            indices[j + 1] = prev_t;
            --j;

            if (j < 0) {
                break;
            }

            prev_t = indices[j];
        } while (((t ^ prev_t) <= 0xFFFF) && (data_a < bswap_32(*(const uint32_t*)(v + (prev_t & 0xFFFF) + 2))));
        indices[j + 1] = t;
        t = t2;
    }
}


static NOINLINE void sort_indices(uint32_t N, const uint8_t* RESTRICT v, uint32_t* RESTRICT indices, uint32_t* RESTRICT tmp_indices)
{
    uint8_t byte_counters[2][256] = {};
    uint32_t counters[2][256];

    {
#define ITER(X) ++byte_counters[1][v[i + X]];

        enum { unroll = 12 };

        uint32_t i = 0;
        const uint32_t n = N - (unroll - 1);
        for (; i < n; i += unroll) {
            ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8); ITER(9); ITER(10); ITER(11);
        }
        for (; i < N; ++i) {
            ITER(0);
        }
        memcpy(&byte_counters[0], &byte_counters[1], 256);
        --byte_counters[0][v[0]];

#undef ITER
    }

    {
        uint32_t c0 = byte_counters[0][0];
        uint32_t c1 = byte_counters[1][0] - 1;
        counters[0][0] = c0;
        counters[1][0] = c1;
        uint8_t* src = &byte_counters[0][0] + 1;
        uint32_t* dst = &counters[0][0] + 1;
        const uint8_t* const e = &byte_counters[0][0] + 256;
        do {
            c0 += src[0];
            c1 += src[256];
            dst[0] = c0;
            dst[256] = c1;
            ++src;
            ++dst;
        } while (src < e);
    }

    {
#define ITER(X) \
        do { \
            const uint32_t byte0 = v[i - X + 0]; \
            const uint32_t byte1 = v[i - X + 1]; \
            tmp_indices[counters[0][byte1]--] = (byte0 << 24) | (byte1 << 16) | (i - X); \
        } while (0)

        enum { unroll = 8 };

        uint32_t i = N;
        for (; i >= unroll; i -= unroll) {
            ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
        }
        for (; i > 0; --i) {
            ITER(1);
        }

#undef ITER
    }

    {
#define ITER(X) \
        do { \
            const uint32_t data = tmp_indices[i - X]; \
            indices[counters[1][data >> 24]--] = data; \
        } while (0)

        enum { unroll = 8 };

        uint32_t i = N;
        for (; i >= unroll; i -= unroll) {
            ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
        }
        for (; i > 0; --i) {
            ITER(1);
        }

#undef ITER
    }

    {
#define ITER(X) do { if UNLIKELY(a[X * 2] == a[(X + 1) * 2]) fix(v, indices, i + X); } while (0)

        enum { unroll = 16 };

        uint32_t i = 1;
        const uint32_t n = N - (unroll - 1);
        const uint16_t* a = ((const uint16_t*)indices) + 1;

        for (; i < n; i += unroll, a += unroll * 2) {
            ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
            ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
        }
        for (; i < N; ++i, a += 2) {
            ITER(0);
        }

#undef ITER
    }

    {
#define ITER(X) a[X] = b[X * 2];

        enum { unroll = 32 };

        uint16_t* a = (uint16_t*)indices;
        uint16_t* b = (uint16_t*)indices;
        uint16_t* e = ((uint16_t*)indices) + (N - (unroll - 1));

        for (; a < e; a += unroll, b += unroll * 2) {
            ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
            ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
            ITER(16); ITER(17); ITER(18); ITER(19); ITER(20); ITER(21); ITER(22); ITER(23);
            ITER(24); ITER(25); ITER(26); ITER(27); ITER(28); ITER(29); ITER(30); ITER(31);
        }

        e = ((uint16_t*)indices) + N;
        for (; a < e; ++a, b += 2) {
            ITER(0);
        }

#undef ITER
    }
}


void sort_indices_astrobwt_v2(uint32_t N, const uint8_t* v, uint32_t* indices, uint32_t* tmp_indices)
{
    sort_indices(N, v, indices, tmp_indices);
}
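sort_indices above is a stable byte-wise counting sort over the top two key bytes (LSB pass into tmp_indices, MSB pass back into indices), followed by an insertion-sort fixup for entries whose 16-bit key prefixes tie. A toy ascending-scatter sketch of the same counting-sort idea; the code above scatters downward from inclusive sums, which is equivalent, and radix2_u16 is a hypothetical name for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Two stable counting-sort passes: low byte first, then high byte. */
static void radix2_u16(uint16_t *a, uint16_t *tmp, uint32_t n)
{
    for (int pass = 0; pass < 2; ++pass) {
        uint32_t count[256] = { 0 };
        const int shift = pass * 8;

        for (uint32_t i = 0; i < n; ++i)
            ++count[(a[i] >> shift) & 0xFF];
        for (uint32_t c = 0, i = 0; i < 256; ++i) {  /* exclusive prefix sum */
            const uint32_t t = count[i];
            count[i] = c;
            c += t;
        }
        for (uint32_t i = 0; i < n; ++i)             /* stable scatter */
            tmp[count[(a[i] >> shift) & 0xFF]++] = a[i];
        memcpy(a, tmp, n * sizeof(*a));
    }
}

int main(void)
{
    uint16_t v[] = { 0x0302, 0x0101, 0x0201, 0x0102 }, tmp[4];
    radix2_u16(v, tmp, 4);
    for (int i = 0; i < 4; ++i) printf("%04x ", v[i]);  /* 0101 0102 0201 0302 */
    printf("\n");
    return 0;
}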
@ -1,26 +0,0 @@
/* XMRig
 * Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
 * Copyright (c) 2018-2019 tevador <tevador@gmail.com>
 * Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
 * Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
 * Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
 * Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdint.h>


void sort_indices_astrobwt_v2(uint32_t N, const uint8_t* v, uint32_t* indices, uint32_t* tmp_indices);
@ -1,98 +0,0 @@
/*
 * ISC License
 *
 * Copyright (c) 2013-2021
 * Frank Denis <j at pureftpd dot org>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>

#define ROUNDS 20

typedef struct salsa_ctx {
    uint32_t input[16];
} salsa_ctx;

static const int TR[16] = {
    0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
};

#define LOAD32_LE(p) *((uint32_t*)(p))
#define STORE32_LE(dst, src) memcpy((dst), &(src), sizeof(uint32_t))

static void
salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
{
    ctx->input[TR[1]] = LOAD32_LE(k + 0);
    ctx->input[TR[2]] = LOAD32_LE(k + 4);
    ctx->input[TR[3]] = LOAD32_LE(k + 8);
    ctx->input[TR[4]] = LOAD32_LE(k + 12);
    ctx->input[TR[11]] = LOAD32_LE(k + 16);
    ctx->input[TR[12]] = LOAD32_LE(k + 20);
    ctx->input[TR[13]] = LOAD32_LE(k + 24);
    ctx->input[TR[14]] = LOAD32_LE(k + 28);
    ctx->input[TR[0]] = 0x61707865;
    ctx->input[TR[5]] = 0x3320646e;
    ctx->input[TR[10]] = 0x79622d32;
    ctx->input[TR[15]] = 0x6b206574;
}

static void
salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
{
    ctx->input[TR[6]] = LOAD32_LE(iv + 0);
    ctx->input[TR[7]] = LOAD32_LE(iv + 4);
    ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0);
    ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4);
}

static void
salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c,
                      unsigned long long bytes)
{
    uint32_t * const x = &ctx->input[0];

    if (!bytes) {
        return; /* LCOV_EXCL_LINE */
    }

#include "u8.h"
#include "u4.h"
#include "u1.h"
#include "u0.h"
}

int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key)
{
    struct salsa_ctx ctx;

    if (!clen) {
        return 0;
    }

    salsa_keysetup(&ctx, (const uint8_t*)key);
    salsa_ivsetup(&ctx, (const uint8_t*)iv, NULL);
    memset(c, 0, clen);
    salsa20_encrypt_bytes(&ctx, (const uint8_t*)c, (uint8_t*)c, clen);

    return 0;
}
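For reference, a minimal sketch of driving this entry point as declared above (32-byte key, 8-byte IV); the key bytes are hypothetical, and the function is assumed to be linked from this file:

#include <stdint.h>
#include <stdio.h>

extern int salsa20_stream_avx2(void *c, uint64_t clen, const void *iv, const void *key);

int main(void)
{
    static const uint8_t key[32] = { 1, 2, 3 };  /* hypothetical key material */
    const uint64_t iv = 0;                       /* 8-byte nonce, zero here */
    uint8_t stream[128];

    /* The buffer is zeroed internally, so the output is pure keystream. */
    salsa20_stream_avx2(stream, sizeof(stream), &iv, key);

    for (int i = 0; i < 16; ++i) printf("%02x", stream[i]);
    printf("\n");
    return 0;
}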
@ -1,193 +0,0 @@
if (bytes > 0) {
    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
    uint8_t partialblock[64];

    unsigned int i;

    a0 = diag1;
    for (i = 0; i < ROUNDS; i += 4) {
        a0 = _mm_add_epi32(a0, diag0);
        a1 = diag0;
        b0 = a0;
        a0 = _mm_slli_epi32(a0, 7);
        b0 = _mm_srli_epi32(b0, 25);
        diag3 = _mm_xor_si128(diag3, a0);

        diag3 = _mm_xor_si128(diag3, b0);

        a1 = _mm_add_epi32(a1, diag3);
        a2 = diag3;
        b1 = a1;
        a1 = _mm_slli_epi32(a1, 9);
        b1 = _mm_srli_epi32(b1, 23);
        diag2 = _mm_xor_si128(diag2, a1);
        diag3 = _mm_shuffle_epi32(diag3, 0x93);
        diag2 = _mm_xor_si128(diag2, b1);

        a2 = _mm_add_epi32(a2, diag2);
        a3 = diag2;
        b2 = a2;
        a2 = _mm_slli_epi32(a2, 13);
        b2 = _mm_srli_epi32(b2, 19);
        diag1 = _mm_xor_si128(diag1, a2);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag1 = _mm_xor_si128(diag1, b2);

        a3 = _mm_add_epi32(a3, diag1);
        a4 = diag3;
        b3 = a3;
        a3 = _mm_slli_epi32(a3, 18);
        b3 = _mm_srli_epi32(b3, 14);
        diag0 = _mm_xor_si128(diag0, a3);
        diag1 = _mm_shuffle_epi32(diag1, 0x39);
        diag0 = _mm_xor_si128(diag0, b3);

        a4 = _mm_add_epi32(a4, diag0);
        a5 = diag0;
        b4 = a4;
        a4 = _mm_slli_epi32(a4, 7);
        b4 = _mm_srli_epi32(b4, 25);
        diag1 = _mm_xor_si128(diag1, a4);

        diag1 = _mm_xor_si128(diag1, b4);

        a5 = _mm_add_epi32(a5, diag1);
        a6 = diag1;
        b5 = a5;
        a5 = _mm_slli_epi32(a5, 9);
        b5 = _mm_srli_epi32(b5, 23);
        diag2 = _mm_xor_si128(diag2, a5);
        diag1 = _mm_shuffle_epi32(diag1, 0x93);
        diag2 = _mm_xor_si128(diag2, b5);

        a6 = _mm_add_epi32(a6, diag2);
        a7 = diag2;
        b6 = a6;
        a6 = _mm_slli_epi32(a6, 13);
        b6 = _mm_srli_epi32(b6, 19);
        diag3 = _mm_xor_si128(diag3, a6);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag3 = _mm_xor_si128(diag3, b6);

        a7 = _mm_add_epi32(a7, diag3);
        a0 = diag1;
        b7 = a7;
        a7 = _mm_slli_epi32(a7, 18);
        b7 = _mm_srli_epi32(b7, 14);
        diag0 = _mm_xor_si128(diag0, a7);
        diag3 = _mm_shuffle_epi32(diag3, 0x39);
        diag0 = _mm_xor_si128(diag0, b7);

        a0 = _mm_add_epi32(a0, diag0);
        a1 = diag0;
        b0 = a0;
        a0 = _mm_slli_epi32(a0, 7);
        b0 = _mm_srli_epi32(b0, 25);
        diag3 = _mm_xor_si128(diag3, a0);

        diag3 = _mm_xor_si128(diag3, b0);

        a1 = _mm_add_epi32(a1, diag3);
        a2 = diag3;
        b1 = a1;
        a1 = _mm_slli_epi32(a1, 9);
        b1 = _mm_srli_epi32(b1, 23);
        diag2 = _mm_xor_si128(diag2, a1);
        diag3 = _mm_shuffle_epi32(diag3, 0x93);
        diag2 = _mm_xor_si128(diag2, b1);

        a2 = _mm_add_epi32(a2, diag2);
        a3 = diag2;
        b2 = a2;
        a2 = _mm_slli_epi32(a2, 13);
        b2 = _mm_srli_epi32(b2, 19);
        diag1 = _mm_xor_si128(diag1, a2);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag1 = _mm_xor_si128(diag1, b2);

        a3 = _mm_add_epi32(a3, diag1);
        a4 = diag3;
        b3 = a3;
        a3 = _mm_slli_epi32(a3, 18);
        b3 = _mm_srli_epi32(b3, 14);
        diag0 = _mm_xor_si128(diag0, a3);
        diag1 = _mm_shuffle_epi32(diag1, 0x39);
        diag0 = _mm_xor_si128(diag0, b3);

        a4 = _mm_add_epi32(a4, diag0);
        a5 = diag0;
        b4 = a4;
        a4 = _mm_slli_epi32(a4, 7);
        b4 = _mm_srli_epi32(b4, 25);
        diag1 = _mm_xor_si128(diag1, a4);

        diag1 = _mm_xor_si128(diag1, b4);

        a5 = _mm_add_epi32(a5, diag1);
        a6 = diag1;
        b5 = a5;
        a5 = _mm_slli_epi32(a5, 9);
        b5 = _mm_srli_epi32(b5, 23);
        diag2 = _mm_xor_si128(diag2, a5);
        diag1 = _mm_shuffle_epi32(diag1, 0x93);
        diag2 = _mm_xor_si128(diag2, b5);

        a6 = _mm_add_epi32(a6, diag2);
        a7 = diag2;
        b6 = a6;
        a6 = _mm_slli_epi32(a6, 13);
        b6 = _mm_srli_epi32(b6, 19);
        diag3 = _mm_xor_si128(diag3, a6);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag3 = _mm_xor_si128(diag3, b6);

        a7 = _mm_add_epi32(a7, diag3);
        a0 = diag1;
        b7 = a7;
        a7 = _mm_slli_epi32(a7, 18);
        b7 = _mm_srli_epi32(b7, 14);
        diag0 = _mm_xor_si128(diag0, a7);
        diag3 = _mm_shuffle_epi32(diag3, 0x39);
        diag0 = _mm_xor_si128(diag0, b7);
    }

    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));

#define ONEQUAD_SHUFFLE(A, B, C, D) \
    do { \
        uint32_t in##A = _mm_cvtsi128_si32(diag0); \
        uint32_t in##B = _mm_cvtsi128_si32(diag1); \
        uint32_t in##C = _mm_cvtsi128_si32(diag2); \
        uint32_t in##D = _mm_cvtsi128_si32(diag3); \
        diag0 = _mm_shuffle_epi32(diag0, 0x39); \
        diag1 = _mm_shuffle_epi32(diag1, 0x39); \
        diag2 = _mm_shuffle_epi32(diag2, 0x39); \
        diag3 = _mm_shuffle_epi32(diag3, 0x39); \
        *(uint32_t *) (partialblock + (A * 4)) = in##A; \
        *(uint32_t *) (partialblock + (B * 4)) = in##B; \
        *(uint32_t *) (partialblock + (C * 4)) = in##C; \
        *(uint32_t *) (partialblock + (D * 4)) = in##D; \
    } while (0)

#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)

    ONEQUAD(0, 12, 8, 4);
    ONEQUAD(5, 1, 13, 9);
    ONEQUAD(10, 6, 2, 14);
    ONEQUAD(15, 11, 7, 3);

#undef ONEQUAD
#undef ONEQUAD_SHUFFLE

    for (i = 0; i < bytes; i++) {
        c[i] = m[i] ^ partialblock[i];
    }
}
@ -1,207 +0,0 @@
while (bytes >= 64) {
    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
    __m128i b0, b1, b2, b3, b4, b5, b6, b7;

    uint32_t in8;
    uint32_t in9;
    int i;

    a0 = diag1;
    for (i = 0; i < ROUNDS; i += 4) {
        a0 = _mm_add_epi32(a0, diag0);
        a1 = diag0;
        b0 = a0;
        a0 = _mm_slli_epi32(a0, 7);
        b0 = _mm_srli_epi32(b0, 25);
        diag3 = _mm_xor_si128(diag3, a0);

        diag3 = _mm_xor_si128(diag3, b0);

        a1 = _mm_add_epi32(a1, diag3);
        a2 = diag3;
        b1 = a1;
        a1 = _mm_slli_epi32(a1, 9);
        b1 = _mm_srli_epi32(b1, 23);
        diag2 = _mm_xor_si128(diag2, a1);
        diag3 = _mm_shuffle_epi32(diag3, 0x93);
        diag2 = _mm_xor_si128(diag2, b1);

        a2 = _mm_add_epi32(a2, diag2);
        a3 = diag2;
        b2 = a2;
        a2 = _mm_slli_epi32(a2, 13);
        b2 = _mm_srli_epi32(b2, 19);
        diag1 = _mm_xor_si128(diag1, a2);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag1 = _mm_xor_si128(diag1, b2);

        a3 = _mm_add_epi32(a3, diag1);
        a4 = diag3;
        b3 = a3;
        a3 = _mm_slli_epi32(a3, 18);
        b3 = _mm_srli_epi32(b3, 14);
        diag0 = _mm_xor_si128(diag0, a3);
        diag1 = _mm_shuffle_epi32(diag1, 0x39);
        diag0 = _mm_xor_si128(diag0, b3);

        a4 = _mm_add_epi32(a4, diag0);
        a5 = diag0;
        b4 = a4;
        a4 = _mm_slli_epi32(a4, 7);
        b4 = _mm_srli_epi32(b4, 25);
        diag1 = _mm_xor_si128(diag1, a4);

        diag1 = _mm_xor_si128(diag1, b4);

        a5 = _mm_add_epi32(a5, diag1);
        a6 = diag1;
        b5 = a5;
        a5 = _mm_slli_epi32(a5, 9);
        b5 = _mm_srli_epi32(b5, 23);
        diag2 = _mm_xor_si128(diag2, a5);
        diag1 = _mm_shuffle_epi32(diag1, 0x93);
        diag2 = _mm_xor_si128(diag2, b5);

        a6 = _mm_add_epi32(a6, diag2);
        a7 = diag2;
        b6 = a6;
        a6 = _mm_slli_epi32(a6, 13);
        b6 = _mm_srli_epi32(b6, 19);
        diag3 = _mm_xor_si128(diag3, a6);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag3 = _mm_xor_si128(diag3, b6);

        a7 = _mm_add_epi32(a7, diag3);
        a0 = diag1;
        b7 = a7;
        a7 = _mm_slli_epi32(a7, 18);
        b7 = _mm_srli_epi32(b7, 14);
        diag0 = _mm_xor_si128(diag0, a7);
        diag3 = _mm_shuffle_epi32(diag3, 0x39);
        diag0 = _mm_xor_si128(diag0, b7);

        a0 = _mm_add_epi32(a0, diag0);
        a1 = diag0;
        b0 = a0;
        a0 = _mm_slli_epi32(a0, 7);
        b0 = _mm_srli_epi32(b0, 25);
        diag3 = _mm_xor_si128(diag3, a0);

        diag3 = _mm_xor_si128(diag3, b0);

        a1 = _mm_add_epi32(a1, diag3);
        a2 = diag3;
        b1 = a1;
        a1 = _mm_slli_epi32(a1, 9);
        b1 = _mm_srli_epi32(b1, 23);
        diag2 = _mm_xor_si128(diag2, a1);
        diag3 = _mm_shuffle_epi32(diag3, 0x93);
        diag2 = _mm_xor_si128(diag2, b1);

        a2 = _mm_add_epi32(a2, diag2);
        a3 = diag2;
        b2 = a2;
        a2 = _mm_slli_epi32(a2, 13);
        b2 = _mm_srli_epi32(b2, 19);
        diag1 = _mm_xor_si128(diag1, a2);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag1 = _mm_xor_si128(diag1, b2);

        a3 = _mm_add_epi32(a3, diag1);
        a4 = diag3;
        b3 = a3;
        a3 = _mm_slli_epi32(a3, 18);
        b3 = _mm_srli_epi32(b3, 14);
        diag0 = _mm_xor_si128(diag0, a3);
        diag1 = _mm_shuffle_epi32(diag1, 0x39);
        diag0 = _mm_xor_si128(diag0, b3);

        a4 = _mm_add_epi32(a4, diag0);
        a5 = diag0;
        b4 = a4;
        a4 = _mm_slli_epi32(a4, 7);
        b4 = _mm_srli_epi32(b4, 25);
        diag1 = _mm_xor_si128(diag1, a4);

        diag1 = _mm_xor_si128(diag1, b4);

        a5 = _mm_add_epi32(a5, diag1);
        a6 = diag1;
        b5 = a5;
        a5 = _mm_slli_epi32(a5, 9);
        b5 = _mm_srli_epi32(b5, 23);
        diag2 = _mm_xor_si128(diag2, a5);
        diag1 = _mm_shuffle_epi32(diag1, 0x93);
        diag2 = _mm_xor_si128(diag2, b5);

        a6 = _mm_add_epi32(a6, diag2);
        a7 = diag2;
        b6 = a6;
        a6 = _mm_slli_epi32(a6, 13);
        b6 = _mm_srli_epi32(b6, 19);
        diag3 = _mm_xor_si128(diag3, a6);
        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
        diag3 = _mm_xor_si128(diag3, b6);

        a7 = _mm_add_epi32(a7, diag3);
        a0 = diag1;
        b7 = a7;
        a7 = _mm_slli_epi32(a7, 18);
        b7 = _mm_srli_epi32(b7, 14);
        diag0 = _mm_xor_si128(diag0, a7);
        diag3 = _mm_shuffle_epi32(diag3, 0x39);
        diag0 = _mm_xor_si128(diag0, b7);
    }

    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));

#define ONEQUAD_SHUFFLE(A, B, C, D) \
    do { \
        uint32_t in##A = _mm_cvtsi128_si32(diag0); \
        uint32_t in##B = _mm_cvtsi128_si32(diag1); \
        uint32_t in##C = _mm_cvtsi128_si32(diag2); \
        uint32_t in##D = _mm_cvtsi128_si32(diag3); \
        diag0 = _mm_shuffle_epi32(diag0, 0x39); \
        diag1 = _mm_shuffle_epi32(diag1, 0x39); \
        diag2 = _mm_shuffle_epi32(diag2, 0x39); \
        diag3 = _mm_shuffle_epi32(diag3, 0x39); \
        in##A ^= *(const uint32_t *) (m + (A * 4)); \
        in##B ^= *(const uint32_t *) (m + (B * 4)); \
        in##C ^= *(const uint32_t *) (m + (C * 4)); \
        in##D ^= *(const uint32_t *) (m + (D * 4)); \
        *(uint32_t *) (c + (A * 4)) = in##A; \
        *(uint32_t *) (c + (B * 4)) = in##B; \
        *(uint32_t *) (c + (C * 4)) = in##C; \
        *(uint32_t *) (c + (D * 4)) = in##D; \
    } while (0)

#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)

    ONEQUAD(0, 12, 8, 4);
    ONEQUAD(5, 1, 13, 9);
    ONEQUAD(10, 6, 2, 14);
    ONEQUAD(15, 11, 7, 3);

#undef ONEQUAD
#undef ONEQUAD_SHUFFLE

    in8 = x[8];
    in9 = x[13];
    in8++;
    if (in8 == 0) {
        in9++;
    }
    x[8] = in8;
    x[13] = in9;

    c += 64;
    m += 64;
    bytes -= 64;
}
@ -1,547 +0,0 @@
|
|||
if (bytes >= 256) {
|
||||
__m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
|
||||
y15;
|
||||
__m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
|
||||
z15;
|
||||
__m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
|
||||
orig9, orig10, orig11, orig12, orig13, orig14, orig15;
|
||||
|
||||
uint32_t in8;
|
||||
uint32_t in9;
|
||||
int i;
|
||||
|
||||
/* element broadcast immediate for _mm_shuffle_epi32 are in order:
|
||||
0x00, 0x55, 0xaa, 0xff */
|
||||
z0 = _mm_loadu_si128((const __m128i *) (x + 0));
|
||||
z5 = _mm_shuffle_epi32(z0, 0x55);
|
||||
z10 = _mm_shuffle_epi32(z0, 0xaa);
|
||||
z15 = _mm_shuffle_epi32(z0, 0xff);
|
||||
z0 = _mm_shuffle_epi32(z0, 0x00);
|
||||
z1 = _mm_loadu_si128((const __m128i *) (x + 4));
|
||||
z6 = _mm_shuffle_epi32(z1, 0xaa);
|
||||
z11 = _mm_shuffle_epi32(z1, 0xff);
|
||||
z12 = _mm_shuffle_epi32(z1, 0x00);
|
||||
z1 = _mm_shuffle_epi32(z1, 0x55);
|
||||
z2 = _mm_loadu_si128((const __m128i *) (x + 8));
|
||||
z7 = _mm_shuffle_epi32(z2, 0xff);
|
||||
z13 = _mm_shuffle_epi32(z2, 0x55);
|
||||
z2 = _mm_shuffle_epi32(z2, 0xaa);
|
||||
/* no z8 -> first half of the nonce, will fill later */
|
||||
z3 = _mm_loadu_si128((const __m128i *) (x + 12));
|
||||
z4 = _mm_shuffle_epi32(z3, 0x00);
|
||||
z14 = _mm_shuffle_epi32(z3, 0xaa);
|
||||
z3 = _mm_shuffle_epi32(z3, 0xff);
|
||||
/* no z9 -> second half of the nonce, will fill later */
|
||||
orig0 = z0;
|
||||
orig1 = z1;
|
||||
orig2 = z2;
|
||||
orig3 = z3;
|
||||
orig4 = z4;
|
||||
orig5 = z5;
|
||||
orig6 = z6;
|
||||
orig7 = z7;
|
||||
orig10 = z10;
|
||||
orig11 = z11;
|
||||
orig12 = z12;
|
||||
orig13 = z13;
|
||||
orig14 = z14;
|
||||
orig15 = z15;
|
||||
|
||||
while (bytes >= 256) {
|
||||
/* vector implementation for z8 and z9 */
|
||||
/* not sure if it helps for only 4 blocks */
|
||||
const __m128i addv8 = _mm_set_epi64x(1, 0);
|
||||
const __m128i addv9 = _mm_set_epi64x(3, 2);
|
||||
__m128i t8, t9;
|
||||
uint64_t in89;
|
||||
|
||||
in8 = x[8];
|
||||
in9 = x[13];
|
||||
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
|
||||
t8 = _mm_set1_epi64x(in89);
|
||||
t9 = _mm_set1_epi64x(in89);
|
||||
|
||||
z8 = _mm_add_epi64(addv8, t8);
|
||||
z9 = _mm_add_epi64(addv9, t9);
|
||||
|
||||
t8 = _mm_unpacklo_epi32(z8, z9);
|
||||
t9 = _mm_unpackhi_epi32(z8, z9);
|
||||
|
||||
z8 = _mm_unpacklo_epi32(t8, t9);
|
||||
z9 = _mm_unpackhi_epi32(t8, t9);
|
||||
|
||||
orig8 = z8;
|
||||
orig9 = z9;
|
||||
|
||||
in89 += 4;
|
||||
|
||||
x[8] = in89 & 0xFFFFFFFF;
|
||||
x[13] = (in89 >> 32) & 0xFFFFFFFF;
|
||||
|
||||
z5 = orig5;
|
||||
z10 = orig10;
|
||||
z15 = orig15;
|
||||
z14 = orig14;
|
||||
z3 = orig3;
|
||||
z6 = orig6;
|
||||
z11 = orig11;
|
||||
z1 = orig1;
|
||||
|
||||
z7 = orig7;
|
||||
z13 = orig13;
|
||||
z2 = orig2;
|
||||
z9 = orig9;
|
||||
z0 = orig0;
|
||||
z12 = orig12;
|
||||
z4 = orig4;
|
||||
z8 = orig8;
|
||||
|
||||
for (i = 0; i < ROUNDS; i += 2) {
|
||||
/* the inner loop is a direct translation (regexp search/replace)
|
||||
* from the amd64-xmm6 ASM */
|
||||
__m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
|
||||
r14, r15;
|
||||
|
||||
y4 = z12;
|
||||
y4 = _mm_add_epi32(y4, z0);
|
||||
r4 = y4;
|
||||
y4 = _mm_slli_epi32(y4, 7);
|
||||
z4 = _mm_xor_si128(z4, y4);
|
||||
r4 = _mm_srli_epi32(r4, 25);
|
||||
z4 = _mm_xor_si128(z4, r4);
|
||||
|
||||
y9 = z1;
|
||||
y9 = _mm_add_epi32(y9, z5);
|
||||
r9 = y9;
|
||||
y9 = _mm_slli_epi32(y9, 7);
|
||||
z9 = _mm_xor_si128(z9, y9);
|
||||
r9 = _mm_srli_epi32(r9, 25);
|
||||
z9 = _mm_xor_si128(z9, r9);
|
||||
|
||||
y8 = z0;
|
||||
y8 = _mm_add_epi32(y8, z4);
|
||||
r8 = y8;
|
||||
y8 = _mm_slli_epi32(y8, 9);
|
||||
z8 = _mm_xor_si128(z8, y8);
|
||||
r8 = _mm_srli_epi32(r8, 23);
|
||||
z8 = _mm_xor_si128(z8, r8);
|
||||
|
||||
y13 = z5;
|
||||
y13 = _mm_add_epi32(y13, z9);
|
||||
r13 = y13;
|
||||
y13 = _mm_slli_epi32(y13, 9);
|
||||
z13 = _mm_xor_si128(z13, y13);
|
||||
r13 = _mm_srli_epi32(r13, 23);
|
||||
z13 = _mm_xor_si128(z13, r13);
|
||||
|
||||
y12 = z4;
|
||||
y12 = _mm_add_epi32(y12, z8);
|
||||
r12 = y12;
|
||||
y12 = _mm_slli_epi32(y12, 13);
|
||||
z12 = _mm_xor_si128(z12, y12);
|
||||
r12 = _mm_srli_epi32(r12, 19);
|
||||
z12 = _mm_xor_si128(z12, r12);
|
||||
|
||||
y1 = z9;
|
||||
y1 = _mm_add_epi32(y1, z13);
|
||||
r1 = y1;
|
||||
y1 = _mm_slli_epi32(y1, 13);
|
||||
z1 = _mm_xor_si128(z1, y1);
|
||||
r1 = _mm_srli_epi32(r1, 19);
|
||||
z1 = _mm_xor_si128(z1, r1);
|
||||
|
||||
y0 = z8;
|
||||
y0 = _mm_add_epi32(y0, z12);
|
||||
r0 = y0;
|
||||
y0 = _mm_slli_epi32(y0, 18);
|
||||
z0 = _mm_xor_si128(z0, y0);
|
||||
r0 = _mm_srli_epi32(r0, 14);
|
||||
z0 = _mm_xor_si128(z0, r0);
|
||||
|
||||
y5 = z13;
|
||||
y5 = _mm_add_epi32(y5, z1);
|
||||
r5 = y5;
|
||||
y5 = _mm_slli_epi32(y5, 18);
|
||||
z5 = _mm_xor_si128(z5, y5);
|
||||
r5 = _mm_srli_epi32(r5, 14);
|
||||
z5 = _mm_xor_si128(z5, r5);
|
||||
|
||||
y14 = z6;
|
||||
y14 = _mm_add_epi32(y14, z10);
|
||||
r14 = y14;
|
||||
y14 = _mm_slli_epi32(y14, 7);
|
||||
z14 = _mm_xor_si128(z14, y14);
|
||||
r14 = _mm_srli_epi32(r14, 25);
|
||||
z14 = _mm_xor_si128(z14, r14);
|
||||
|
||||
y3 = z11;
|
||||
y3 = _mm_add_epi32(y3, z15);
|
||||
r3 = y3;
|
||||
y3 = _mm_slli_epi32(y3, 7);
|
||||
z3 = _mm_xor_si128(z3, y3);
|
||||
r3 = _mm_srli_epi32(r3, 25);
|
||||
z3 = _mm_xor_si128(z3, r3);
|
||||
|
||||
y2 = z10;
|
||||
y2 = _mm_add_epi32(y2, z14);
|
||||
r2 = y2;
|
||||
y2 = _mm_slli_epi32(y2, 9);
|
||||
z2 = _mm_xor_si128(z2, y2);
|
||||
r2 = _mm_srli_epi32(r2, 23);
|
||||
z2 = _mm_xor_si128(z2, r2);
|
||||
|
||||
y7 = z15;
|
||||
y7 = _mm_add_epi32(y7, z3);
|
||||
r7 = y7;
|
||||
y7 = _mm_slli_epi32(y7, 9);
|
||||
z7 = _mm_xor_si128(z7, y7);
|
||||
r7 = _mm_srli_epi32(r7, 23);
|
||||
z7 = _mm_xor_si128(z7, r7);
|
||||
|
||||
y6 = z14;
|
||||
y6 = _mm_add_epi32(y6, z2);
|
||||
r6 = y6;
|
||||
y6 = _mm_slli_epi32(y6, 13);
|
||||
z6 = _mm_xor_si128(z6, y6);
|
||||
r6 = _mm_srli_epi32(r6, 19);
|
||||
z6 = _mm_xor_si128(z6, r6);
|
||||
|
||||
y11 = z3;
|
||||
y11 = _mm_add_epi32(y11, z7);
|
||||
r11 = y11;
|
||||
y11 = _mm_slli_epi32(y11, 13);
|
||||
z11 = _mm_xor_si128(z11, y11);
|
||||
r11 = _mm_srli_epi32(r11, 19);
|
||||
z11 = _mm_xor_si128(z11, r11);
|
||||
|
||||
y10 = z2;
|
||||
y10 = _mm_add_epi32(y10, z6);
|
||||
r10 = y10;
|
||||
y10 = _mm_slli_epi32(y10, 18);
|
||||
z10 = _mm_xor_si128(z10, y10);
|
||||
r10 = _mm_srli_epi32(r10, 14);
|
||||
z10 = _mm_xor_si128(z10, r10);
|
||||
|
||||
y1 = z3;
|
||||
y1 = _mm_add_epi32(y1, z0);
|
||||
r1 = y1;
|
||||
y1 = _mm_slli_epi32(y1, 7);
|
||||
z1 = _mm_xor_si128(z1, y1);
|
||||
r1 = _mm_srli_epi32(r1, 25);
|
||||
z1 = _mm_xor_si128(z1, r1);
|
||||
|
||||
y15 = z7;
|
||||
y15 = _mm_add_epi32(y15, z11);
|
||||
r15 = y15;
|
||||
y15 = _mm_slli_epi32(y15, 18);
|
||||
z15 = _mm_xor_si128(z15, y15);
|
||||
r15 = _mm_srli_epi32(r15, 14);
|
||||
z15 = _mm_xor_si128(z15, r15);
|
||||
|
||||
y6 = z4;
|
||||
y6 = _mm_add_epi32(y6, z5);
|
||||
r6 = y6;
|
||||
y6 = _mm_slli_epi32(y6, 7);
|
||||
z6 = _mm_xor_si128(z6, y6);
|
||||
r6 = _mm_srli_epi32(r6, 25);
|
||||
z6 = _mm_xor_si128(z6, r6);
|
||||
|
||||
y2 = z0;
|
||||
y2 = _mm_add_epi32(y2, z1);
|
||||
r2 = y2;
|
||||
y2 = _mm_slli_epi32(y2, 9);
|
||||
z2 = _mm_xor_si128(z2, y2);
|
||||
r2 = _mm_srli_epi32(r2, 23);
|
||||
z2 = _mm_xor_si128(z2, r2);
|
||||
|
||||
y7 = z5;
|
||||
y7 = _mm_add_epi32(y7, z6);
|
||||
r7 = y7;
|
||||
y7 = _mm_slli_epi32(y7, 9);
|
||||
z7 = _mm_xor_si128(z7, y7);
|
||||
r7 = _mm_srli_epi32(r7, 23);
|
||||
z7 = _mm_xor_si128(z7, r7);
|
||||
|
||||
y3 = z1;
|
||||
y3 = _mm_add_epi32(y3, z2);
|
||||
r3 = y3;
|
||||
y3 = _mm_slli_epi32(y3, 13);
|
||||
z3 = _mm_xor_si128(z3, y3);
|
||||
r3 = _mm_srli_epi32(r3, 19);
|
||||
z3 = _mm_xor_si128(z3, r3);
|
||||
|
||||
y4 = z6;
|
||||
y4 = _mm_add_epi32(y4, z7);
|
||||
r4 = y4;
|
||||
y4 = _mm_slli_epi32(y4, 13);
|
||||
z4 = _mm_xor_si128(z4, y4);
|
||||
r4 = _mm_srli_epi32(r4, 19);
|
||||
z4 = _mm_xor_si128(z4, r4);
|
||||
|
||||
y0 = z2;
|
||||
y0 = _mm_add_epi32(y0, z3);
|
||||
r0 = y0;
|
||||
y0 = _mm_slli_epi32(y0, 18);
|
||||
z0 = _mm_xor_si128(z0, y0);
|
||||
r0 = _mm_srli_epi32(r0, 14);
|
||||
z0 = _mm_xor_si128(z0, r0);
|
||||
|
||||
y5 = z7;
|
||||
y5 = _mm_add_epi32(y5, z4);
|
||||
r5 = y5;
|
||||
y5 = _mm_slli_epi32(y5, 18);
|
||||
z5 = _mm_xor_si128(z5, y5);
|
||||
r5 = _mm_srli_epi32(r5, 14);
|
||||
z5 = _mm_xor_si128(z5, r5);
|
||||
|
||||
y11 = z9;
|
||||
y11 = _mm_add_epi32(y11, z10);
|
||||
r11 = y11;
|
||||
y11 = _mm_slli_epi32(y11, 7);
|
||||
z11 = _mm_xor_si128(z11, y11);
|
||||
r11 = _mm_srli_epi32(r11, 25);
|
||||
z11 = _mm_xor_si128(z11, r11);
|
||||
|
||||
y12 = z14;
|
||||
y12 = _mm_add_epi32(y12, z15);
|
||||
r12 = y12;
|
||||
y12 = _mm_slli_epi32(y12, 7);
|
||||
z12 = _mm_xor_si128(z12, y12);
|
||||
r12 = _mm_srli_epi32(r12, 25);
|
||||
z12 = _mm_xor_si128(z12, r12);
|
||||
|
||||
y8 = z10;
|
||||
y8 = _mm_add_epi32(y8, z11);
|
||||
r8 = y8;
|
||||
y8 = _mm_slli_epi32(y8, 9);
|
||||
z8 = _mm_xor_si128(z8, y8);
|
||||
r8 = _mm_srli_epi32(r8, 23);
|
||||
z8 = _mm_xor_si128(z8, r8);
|
||||
|
||||
y13 = z15;
|
||||
y13 = _mm_add_epi32(y13, z12);
|
||||
r13 = y13;
|
||||
y13 = _mm_slli_epi32(y13, 9);
|
||||
z13 = _mm_xor_si128(z13, y13);
|
||||
r13 = _mm_srli_epi32(r13, 23);
|
||||
z13 = _mm_xor_si128(z13, r13);
|
||||
|
||||
y9 = z11;
|
||||
y9 = _mm_add_epi32(y9, z8);
|
||||
r9 = y9;
|
||||
y9 = _mm_slli_epi32(y9, 13);
|
||||
z9 = _mm_xor_si128(z9, y9);
|
||||
r9 = _mm_srli_epi32(r9, 19);
|
||||
z9 = _mm_xor_si128(z9, r9);
|
||||
|
||||
y14 = z12;
|
||||
y14 = _mm_add_epi32(y14, z13);
|
||||
r14 = y14;
|
||||
y14 = _mm_slli_epi32(y14, 13);
|
||||
z14 = _mm_xor_si128(z14, y14);
|
||||
r14 = _mm_srli_epi32(r14, 19);
|
||||
z14 = _mm_xor_si128(z14, r14);
|
||||
|
||||
y10 = z8;
|
||||
y10 = _mm_add_epi32(y10, z9);
|
||||
r10 = y10;
|
||||
y10 = _mm_slli_epi32(y10, 18);
|
||||
z10 = _mm_xor_si128(z10, y10);
|
||||
r10 = _mm_srli_epi32(r10, 14);
|
||||
z10 = _mm_xor_si128(z10, r10);
|
||||
|
||||
y15 = z13;
|
||||
y15 = _mm_add_epi32(y15, z14);
|
||||
r15 = y15;
|
||||
y15 = _mm_slli_epi32(y15, 18);
|
||||
z15 = _mm_xor_si128(z15, y15);
|
||||
r15 = _mm_srli_epi32(r15, 14);
|
||||
z15 = _mm_xor_si128(z15, r15);
|
||||
}
|
||||
|
||||
/* store data ; this macro replicates the original amd64-xmm6 code */
|
||||
#define ONEQUAD_SHUFFLE(A, B, C, D) \
|
||||
z##A = _mm_add_epi32(z##A, orig##A); \
|
||||
z##B = _mm_add_epi32(z##B, orig##B); \
|
||||
z##C = _mm_add_epi32(z##C, orig##C); \
|
||||
z##D = _mm_add_epi32(z##D, orig##D); \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
    in##B = _mm_cvtsi128_si32(z##B); \
    in##C = _mm_cvtsi128_si32(z##C); \
    in##D = _mm_cvtsi128_si32(z##D); \
    z##A = _mm_shuffle_epi32(z##A, 0x39); \
    z##B = _mm_shuffle_epi32(z##B, 0x39); \
    z##C = _mm_shuffle_epi32(z##C, 0x39); \
    z##D = _mm_shuffle_epi32(z##D, 0x39); \
    \
    in##A ^= *(uint32_t *) (m + 0); \
    in##B ^= *(uint32_t *) (m + 4); \
    in##C ^= *(uint32_t *) (m + 8); \
    in##D ^= *(uint32_t *) (m + 12); \
    \
    *(uint32_t *) (c + 0) = in##A; \
    *(uint32_t *) (c + 4) = in##B; \
    *(uint32_t *) (c + 8) = in##C; \
    *(uint32_t *) (c + 12) = in##D; \
    \
    in##A = _mm_cvtsi128_si32(z##A); \
    in##B = _mm_cvtsi128_si32(z##B); \
    in##C = _mm_cvtsi128_si32(z##C); \
    in##D = _mm_cvtsi128_si32(z##D); \
    z##A = _mm_shuffle_epi32(z##A, 0x39); \
    z##B = _mm_shuffle_epi32(z##B, 0x39); \
    z##C = _mm_shuffle_epi32(z##C, 0x39); \
    z##D = _mm_shuffle_epi32(z##D, 0x39); \
    \
    in##A ^= *(uint32_t *) (m + 64); \
    in##B ^= *(uint32_t *) (m + 68); \
    in##C ^= *(uint32_t *) (m + 72); \
    in##D ^= *(uint32_t *) (m + 76); \
    *(uint32_t *) (c + 64) = in##A; \
    *(uint32_t *) (c + 68) = in##B; \
    *(uint32_t *) (c + 72) = in##C; \
    *(uint32_t *) (c + 76) = in##D; \
    \
    in##A = _mm_cvtsi128_si32(z##A); \
    in##B = _mm_cvtsi128_si32(z##B); \
    in##C = _mm_cvtsi128_si32(z##C); \
    in##D = _mm_cvtsi128_si32(z##D); \
    z##A = _mm_shuffle_epi32(z##A, 0x39); \
    z##B = _mm_shuffle_epi32(z##B, 0x39); \
    z##C = _mm_shuffle_epi32(z##C, 0x39); \
    z##D = _mm_shuffle_epi32(z##D, 0x39); \
    \
    in##A ^= *(uint32_t *) (m + 128); \
    in##B ^= *(uint32_t *) (m + 132); \
    in##C ^= *(uint32_t *) (m + 136); \
    in##D ^= *(uint32_t *) (m + 140); \
    *(uint32_t *) (c + 128) = in##A; \
    *(uint32_t *) (c + 132) = in##B; \
    *(uint32_t *) (c + 136) = in##C; \
    *(uint32_t *) (c + 140) = in##D; \
    \
    in##A = _mm_cvtsi128_si32(z##A); \
    in##B = _mm_cvtsi128_si32(z##B); \
    in##C = _mm_cvtsi128_si32(z##C); \
    in##D = _mm_cvtsi128_si32(z##D); \
    \
    in##A ^= *(uint32_t *) (m + 192); \
    in##B ^= *(uint32_t *) (m + 196); \
    in##C ^= *(uint32_t *) (m + 200); \
    in##D ^= *(uint32_t *) (m + 204); \
    *(uint32_t *) (c + 192) = in##A; \
    *(uint32_t *) (c + 196) = in##B; \
    *(uint32_t *) (c + 200) = in##C; \
    *(uint32_t *) (c + 204) = in##D

/* store data; this macro replaces shuffle+mov with a direct extract; not much
 * difference */
#define ONEQUAD_EXTRACT(A, B, C, D) \
    z##A = _mm_add_epi32(z##A, orig##A); \
    z##B = _mm_add_epi32(z##B, orig##B); \
    z##C = _mm_add_epi32(z##C, orig##C); \
    z##D = _mm_add_epi32(z##D, orig##D); \
    in##A = _mm_cvtsi128_si32(z##A); \
    in##B = _mm_cvtsi128_si32(z##B); \
    in##C = _mm_cvtsi128_si32(z##C); \
    in##D = _mm_cvtsi128_si32(z##D); \
    in##A ^= *(uint32_t *) (m + 0); \
    in##B ^= *(uint32_t *) (m + 4); \
    in##C ^= *(uint32_t *) (m + 8); \
    in##D ^= *(uint32_t *) (m + 12); \
    *(uint32_t *) (c + 0) = in##A; \
    *(uint32_t *) (c + 4) = in##B; \
    *(uint32_t *) (c + 8) = in##C; \
    *(uint32_t *) (c + 12) = in##D; \
    \
    in##A = _mm_extract_epi32(z##A, 1); \
    in##B = _mm_extract_epi32(z##B, 1); \
    in##C = _mm_extract_epi32(z##C, 1); \
    in##D = _mm_extract_epi32(z##D, 1); \
    \
    in##A ^= *(uint32_t *) (m + 64); \
    in##B ^= *(uint32_t *) (m + 68); \
    in##C ^= *(uint32_t *) (m + 72); \
    in##D ^= *(uint32_t *) (m + 76); \
    *(uint32_t *) (c + 64) = in##A; \
    *(uint32_t *) (c + 68) = in##B; \
    *(uint32_t *) (c + 72) = in##C; \
    *(uint32_t *) (c + 76) = in##D; \
    \
    in##A = _mm_extract_epi32(z##A, 2); \
    in##B = _mm_extract_epi32(z##B, 2); \
    in##C = _mm_extract_epi32(z##C, 2); \
    in##D = _mm_extract_epi32(z##D, 2); \
    \
    in##A ^= *(uint32_t *) (m + 128); \
    in##B ^= *(uint32_t *) (m + 132); \
    in##C ^= *(uint32_t *) (m + 136); \
    in##D ^= *(uint32_t *) (m + 140); \
    *(uint32_t *) (c + 128) = in##A; \
    *(uint32_t *) (c + 132) = in##B; \
    *(uint32_t *) (c + 136) = in##C; \
    *(uint32_t *) (c + 140) = in##D; \
    \
    in##A = _mm_extract_epi32(z##A, 3); \
    in##B = _mm_extract_epi32(z##B, 3); \
    in##C = _mm_extract_epi32(z##C, 3); \
    in##D = _mm_extract_epi32(z##D, 3); \
    \
    in##A ^= *(uint32_t *) (m + 192); \
    in##B ^= *(uint32_t *) (m + 196); \
    in##C ^= *(uint32_t *) (m + 200); \
    in##D ^= *(uint32_t *) (m + 204); \
    *(uint32_t *) (c + 192) = in##A; \
    *(uint32_t *) (c + 196) = in##B; \
    *(uint32_t *) (c + 200) = in##C; \
    *(uint32_t *) (c + 204) = in##D
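
/* Editor's sketch (assumption, not from the original file): the two lane-read
 * idioms contrasted by the comment above — ONEQUAD_SHUFFLE rotates the lanes
 * with _mm_shuffle_epi32(z, 0x39) and then reads lane 0, while ONEQUAD_EXTRACT
 * reads lane 1 directly via the SSE4.1 pextrd instruction (needs
 * <smmintrin.h>). */
static uint32_t lane1_via_shuffle(__m128i z)
{
    return (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(z, 0x39));
}

static uint32_t lane1_via_extract(__m128i z)
{
    return (uint32_t) _mm_extract_epi32(z, 1);
}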

/* store data; this macro first transposes the data in registers, then stores
 * it to memory; much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
    z##A = _mm_add_epi32(z##A, orig##A); \
    z##B = _mm_add_epi32(z##B, orig##B); \
    z##C = _mm_add_epi32(z##C, orig##C); \
    z##D = _mm_add_epi32(z##D, orig##D); \
    y##A = _mm_unpacklo_epi32(z##A, z##B); \
    y##B = _mm_unpacklo_epi32(z##C, z##D); \
    y##C = _mm_unpackhi_epi32(z##A, z##B); \
    y##D = _mm_unpackhi_epi32(z##C, z##D); \
    z##A = _mm_unpacklo_epi64(y##A, y##B); \
    z##B = _mm_unpackhi_epi64(y##A, y##B); \
    z##C = _mm_unpacklo_epi64(y##C, y##D); \
    z##D = _mm_unpackhi_epi64(y##C, y##D); \
    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \
    _mm_storeu_si128((__m128i *) (c + 0), y##A); \
    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \
    _mm_storeu_si128((__m128i *) (c + 64), y##B); \
    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
    _mm_storeu_si128((__m128i *) (c + 128), y##C); \
    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
    _mm_storeu_si128((__m128i *) (c + 192), y##D)

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

        ONEQUAD(0, 1, 2, 3);
        m += 16;
        c += 16;
        ONEQUAD(4, 5, 6, 7);
        m += 16;
        c += 16;
        ONEQUAD(8, 9, 10, 11);
        m += 16;
        c += 16;
        ONEQUAD(12, 13, 14, 15);
        m -= 48;
        c -= 48;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE

        bytes -= 256;
        c += 256;
        m += 256;
    }
}
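
/* Editor's sketch (assumption, not from the original file): the 4x4 32-bit
 * in-register transpose that ONEQUAD_TRANSPOSE above performs before its
 * XOR-and-store — two unpack passes turn four row vectors into four column
 * vectors. */
#include <emmintrin.h> /* SSE2 */

static void transpose_4x4_epi32(__m128i r[4])
{
    const __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); /* a0 b0 a1 b1 */
    const __m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); /* c0 d0 c1 d1 */
    const __m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); /* a2 b2 a3 b3 */
    const __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); /* c2 d2 c3 d3 */

    r[0] = _mm_unpacklo_epi64(t0, t1); /* a0 b0 c0 d0 */
    r[1] = _mm_unpackhi_epi64(t0, t1); /* a1 b1 c1 d1 */
    r[2] = _mm_unpacklo_epi64(t2, t3); /* a2 b2 c2 d2 */
    r[3] = _mm_unpackhi_epi64(t2, t3); /* a3 b3 c3 d3 */
}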

@@ -1,477 +0,0 @@
if (bytes >= 512) {
    __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
        y15;

    /* the naive way seems as fast (if not a bit faster) than the vector way */
    __m256i z0 = _mm256_set1_epi32(x[0]);
    __m256i z5 = _mm256_set1_epi32(x[1]);
    __m256i z10 = _mm256_set1_epi32(x[2]);
    __m256i z15 = _mm256_set1_epi32(x[3]);
    __m256i z12 = _mm256_set1_epi32(x[4]);
    __m256i z1 = _mm256_set1_epi32(x[5]);
    __m256i z6 = _mm256_set1_epi32(x[6]);
    __m256i z11 = _mm256_set1_epi32(x[7]);
    __m256i z8; /* useless */
    __m256i z13 = _mm256_set1_epi32(x[9]);
    __m256i z2 = _mm256_set1_epi32(x[10]);
    __m256i z7 = _mm256_set1_epi32(x[11]);
    __m256i z4 = _mm256_set1_epi32(x[12]);
    __m256i z9; /* useless */
    __m256i z14 = _mm256_set1_epi32(x[14]);
    __m256i z3 = _mm256_set1_epi32(x[15]);

    __m256i orig0 = z0;
    __m256i orig1 = z1;
    __m256i orig2 = z2;
    __m256i orig3 = z3;
    __m256i orig4 = z4;
    __m256i orig5 = z5;
    __m256i orig6 = z6;
    __m256i orig7 = z7;
    __m256i orig8;
    __m256i orig9;
    __m256i orig10 = z10;
    __m256i orig11 = z11;
    __m256i orig12 = z12;
    __m256i orig13 = z13;
    __m256i orig14 = z14;
    __m256i orig15 = z15;

    uint32_t in8;
    uint32_t in9;
    int i;

    while (bytes >= 512) {
        /* vector implementation for z8 and z9 */
        /* faster than the naive version for 8 blocks */
        const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
        const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

        __m256i t8, t9;
        uint64_t in89;

        in8 = x[8];
        in9 = x[13]; /* see arrays above for the address translation */
        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);

        z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));

        t8 = _mm256_add_epi64(addv8, z8);
        t9 = _mm256_add_epi64(addv9, z9);

        z8 = _mm256_unpacklo_epi32(t8, t9);
        z9 = _mm256_unpackhi_epi32(t8, t9);

        t8 = _mm256_unpacklo_epi32(z8, z9);
        t9 = _mm256_unpackhi_epi32(z8, z9);

        /* required because unpack* are intra-lane */
        z8 = _mm256_permutevar8x32_epi32(t8, permute);
        z9 = _mm256_permutevar8x32_epi32(t9, permute);

        orig8 = z8;
        orig9 = z9;

        in89 += 8;

        x[8] = in89 & 0xFFFFFFFF;
        x[13] = (in89 >> 32) & 0xFFFFFFFF;
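
        /* Editor's sketch (assumption, not from the original file): a scalar
         * view of the lane layout built above — lane i of z8 carries the low
         * 32 bits and lane i of z9 the high 32 bits of the 64-bit block
         * counter for block i of the 8 blocks processed per iteration. */
        {
            const uint64_t ctr = ((uint64_t) in8) | (((uint64_t) in9) << 32);
            uint32_t lo_lane[8], hi_lane[8];
            for (int blk = 0; blk < 8; blk++) {
                lo_lane[blk] = (uint32_t) (ctr + blk);         /* lane blk of z8 */
                hi_lane[blk] = (uint32_t) ((ctr + blk) >> 32); /* lane blk of z9 */
            }
            (void) lo_lane;
            (void) hi_lane;
        }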

        z5 = orig5;
        z10 = orig10;
        z15 = orig15;
        z14 = orig14;
        z3 = orig3;
        z6 = orig6;
        z11 = orig11;
        z1 = orig1;

        z7 = orig7;
        z13 = orig13;
        z2 = orig2;
        z9 = orig9;
        z0 = orig0;
        z12 = orig12;
        z4 = orig4;
        z8 = orig8;

        for (i = 0; i < ROUNDS; i += 2) {
            /* the inner loop is a direct translation (regexp search/replace)
             * from the amd64-xmm6 ASM */
            __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
                r14, r15;

            y4 = z12;
            y4 = _mm256_add_epi32(y4, z0);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 7);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 25);
            z4 = _mm256_xor_si256(z4, r4);

            y9 = z1;
            y9 = _mm256_add_epi32(y9, z5);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 7);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 25);
            z9 = _mm256_xor_si256(z9, r9);

            y8 = z0;
            y8 = _mm256_add_epi32(y8, z4);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z5;
            y13 = _mm256_add_epi32(y13, z9);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);

            y12 = z4;
            y12 = _mm256_add_epi32(y12, z8);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 13);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 19);
            z12 = _mm256_xor_si256(z12, r12);

            y1 = z9;
            y1 = _mm256_add_epi32(y1, z13);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 13);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 19);
            z1 = _mm256_xor_si256(z1, r1);

            y0 = z8;
            y0 = _mm256_add_epi32(y0, z12);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z13;
            y5 = _mm256_add_epi32(y5, z1);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y14 = z6;
            y14 = _mm256_add_epi32(y14, z10);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 7);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 25);
            z14 = _mm256_xor_si256(z14, r14);

            y3 = z11;
            y3 = _mm256_add_epi32(y3, z15);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 7);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 25);
            z3 = _mm256_xor_si256(z3, r3);

            y2 = z10;
            y2 = _mm256_add_epi32(y2, z14);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z15;
            y7 = _mm256_add_epi32(y7, z3);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y6 = z14;
            y6 = _mm256_add_epi32(y6, z2);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 13);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 19);
            z6 = _mm256_xor_si256(z6, r6);

            y11 = z3;
            y11 = _mm256_add_epi32(y11, z7);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 13);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 19);
            z11 = _mm256_xor_si256(z11, r11);

            y10 = z2;
            y10 = _mm256_add_epi32(y10, z6);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            y1 = z3;
            y1 = _mm256_add_epi32(y1, z0);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 7);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 25);
            z1 = _mm256_xor_si256(z1, r1);

            y15 = z7;
            y15 = _mm256_add_epi32(y15, z11);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);

            y6 = z4;
            y6 = _mm256_add_epi32(y6, z5);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 7);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 25);
            z6 = _mm256_xor_si256(z6, r6);

            y2 = z0;
            y2 = _mm256_add_epi32(y2, z1);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z5;
            y7 = _mm256_add_epi32(y7, z6);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y3 = z1;
            y3 = _mm256_add_epi32(y3, z2);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 13);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 19);
            z3 = _mm256_xor_si256(z3, r3);

            y4 = z6;
            y4 = _mm256_add_epi32(y4, z7);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 13);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 19);
            z4 = _mm256_xor_si256(z4, r4);

            y0 = z2;
            y0 = _mm256_add_epi32(y0, z3);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z7;
            y5 = _mm256_add_epi32(y5, z4);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y11 = z9;
            y11 = _mm256_add_epi32(y11, z10);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 7);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 25);
            z11 = _mm256_xor_si256(z11, r11);

            y12 = z14;
            y12 = _mm256_add_epi32(y12, z15);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 7);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 25);
            z12 = _mm256_xor_si256(z12, r12);

            y8 = z10;
            y8 = _mm256_add_epi32(y8, z11);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z15;
            y13 = _mm256_add_epi32(y13, z12);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);

            y9 = z11;
            y9 = _mm256_add_epi32(y9, z8);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 13);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 19);
            z9 = _mm256_xor_si256(z9, r9);

            y14 = z12;
            y14 = _mm256_add_epi32(y14, z13);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 13);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 19);
            z14 = _mm256_xor_si256(z14, r14);

            y10 = z8;
            y10 = _mm256_add_epi32(y10, z9);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            y15 = z13;
            y15 = _mm256_add_epi32(y15, z14);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);
        }
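
        /* Editor's sketch (assumption, not from the original file): every step
         * of the loop above is the same pattern, z ^= ROTL32(a + b, k). AVX2
         * has no 32-bit vector rotate, so the rotation is built from a left
         * shift and the complementary right shift, each XORed into z — the
         * y/r register pairs above. E.g. the first step of the loop is
         * XOR_ROTL32_SKETCH(z4, z12, z0, 7). */
#define XOR_ROTL32_SKETCH(z, a, b, k) \
    do { \
        const __m256i sum_ = _mm256_add_epi32(a, b); \
        z = _mm256_xor_si256(z, _mm256_slli_epi32(sum_, (k))); \
        z = _mm256_xor_si256(z, _mm256_srli_epi32(sum_, 32 - (k))); \
    } while (0)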

/* store data; this macro first transposes the data in registers, then stores
 * it to memory; much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
    { \
        __m128i t0, t1, t2, t3; \
        z##A = _mm256_add_epi32(z##A, orig##A); \
        z##B = _mm256_add_epi32(z##B, orig##B); \
        z##C = _mm256_add_epi32(z##C, orig##C); \
        z##D = _mm256_add_epi32(z##D, orig##D); \
        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
                           _mm_loadu_si128((const __m128i*) (m + 0))); \
        _mm_storeu_si128((__m128i*) (c + 0), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
                           _mm_loadu_si128((const __m128i*) (m + 64))); \
        _mm_storeu_si128((__m128i*) (c + 64), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
                           _mm_loadu_si128((const __m128i*) (m + 128))); \
        _mm_storeu_si128((__m128i*) (c + 128), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
                           _mm_loadu_si128((const __m128i*) (m + 192))); \
        _mm_storeu_si128((__m128i*) (c + 192), t3); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
                           _mm_loadu_si128((const __m128i*) (m + 256))); \
        _mm_storeu_si128((__m128i*) (c + 256), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
                           _mm_loadu_si128((const __m128i*) (m + 320))); \
        _mm_storeu_si128((__m128i*) (c + 320), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
                           _mm_loadu_si128((const __m128i*) (m + 384))); \
        _mm_storeu_si128((__m128i*) (c + 384), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
                           _mm_loadu_si128((const __m128i*) (m + 448))); \
        _mm_storeu_si128((__m128i*) (c + 448), t3); \
    }

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

#define ONEQUAD_UNPCK(A, B, C, D) \
    { \
        z##A = _mm256_add_epi32(z##A, orig##A); \
        z##B = _mm256_add_epi32(z##B, orig##B); \
        z##C = _mm256_add_epi32(z##C, orig##C); \
        z##D = _mm256_add_epi32(z##D, orig##D); \
        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
    }

#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
    { \
        ONEQUAD_UNPCK(A, B, C, D); \
        ONEQUAD_UNPCK(A2, B2, C2, D2); \
        y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
        y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
        y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
        y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
        y##A = _mm256_xor_si256(y##A, \
                                _mm256_loadu_si256((const __m256i*) (m + 0))); \
        y##B = _mm256_xor_si256( \
            y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \
        y##C = _mm256_xor_si256( \
            y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \
        y##D = _mm256_xor_si256( \
            y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \
        y##A2 = _mm256_xor_si256( \
            y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
        y##B2 = _mm256_xor_si256( \
            y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
        y##C2 = _mm256_xor_si256( \
            y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
        y##D2 = _mm256_xor_si256( \
            y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
        _mm256_storeu_si256((__m256i*) (c + 0), y##A); \
        _mm256_storeu_si256((__m256i*) (c + 64), y##B); \
        _mm256_storeu_si256((__m256i*) (c + 128), y##C); \
        _mm256_storeu_si256((__m256i*) (c + 192), y##D); \
        _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
        _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
        _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
        _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
    }

        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
        m += 32;
        c += 32;
        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
        m -= 32;
        c -= 32;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
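
/* Editor's sketch (assumption, not from the original file): why ONEOCTO needs
 * _mm256_permute2x128_si256 — the AVX2 unpack instructions only interleave
 * within each 128-bit lane, so pairing the matching lanes of two registers
 * takes an explicit cross-lane permute. Selector 0x20 concatenates the two
 * low lanes, 0x31 the two high lanes: */
#define LANES_LO_SKETCH(a, b) _mm256_permute2x128_si256(a, b, 0x20) /* [a.lo | b.lo] */
#define LANES_HI_SKETCH(a, b) _mm256_permute2x128_si256(a, b, 0x31) /* [a.hi | b.hi] */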

        bytes -= 512;
        c += 512;
        m += 512;
    }
}

@@ -35,11 +35,6 @@
#endif


#ifdef XMRIG_ALGO_ASTROBWT
#   include "crypto/astrobwt/AstroBWT.h"
#endif


#define ADD_FN(algo) do { \
    m_map[algo] = new cn_hash_fun_array{}; \
    m_map[algo]->data[AV_SINGLE][Assembly::NONE] = cryptonight_single_hash<algo, false, 0>; \

@@ -375,12 +370,6 @@ xmrig::CnHash::CnHash()
    m_map[Algorithm::AR2_WRKZ]->data[AV_SINGLE_SOFT][Assembly::NONE] = argon2::single_hash<Algorithm::AR2_WRKZ>;
#   endif

#   ifdef XMRIG_ALGO_ASTROBWT
    m_map[Algorithm::ASTROBWT_DERO_2] = new cn_hash_fun_array{};
    m_map[Algorithm::ASTROBWT_DERO_2]->data[AV_SINGLE][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO_2>;
    m_map[Algorithm::ASTROBWT_DERO_2]->data[AV_SINGLE_SOFT][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO_2>;
#   endif

#   ifdef XMRIG_ALGO_GHOSTRIDER
    ADD_FN(Algorithm::CN_GR_0);
    ADD_FN(Algorithm::CN_GR_1);

@@ -432,23 +432,6 @@ const static uint8_t argon2_wrkz_test_out[256] = {
#endif


#ifdef XMRIG_ALGO_ASTROBWT
// "astrobwt/v2"
const static uint8_t astrobwt_dero_2_test_out[256] = {
    0x48, 0x9E, 0xD2, 0x66, 0x14, 0x27, 0x98, 0x65, 0x03, 0xFB, 0x87, 0x25, 0xE1, 0xD3, 0x98, 0xDA,
    0x27, 0xEE, 0x25, 0x3D, 0xB4, 0x37, 0x87, 0x98, 0xBF, 0x5A, 0x5C, 0x94, 0xEE, 0x0C, 0xE2, 0x2A,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif


#ifdef XMRIG_ALGO_GHOSTRIDER
// "GhostRider"
const static uint8_t test_output_gr[256] = {

@@ -43,22 +43,22 @@ set(SOURCES
)

if (CMAKE_C_COMPILER_ID MATCHES MSVC)
    set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
    set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
    set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
    set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os")
    set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os")

@@ -28,13 +28,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#pragma once


void randomx_set_huge_pages_jit(bool)
{
}


void randomx_set_optimized_dataset_init(int)
{
}