Merge remote-tracking branch 'remotes/origin/sync-base' into evo

XMRig 2022-08-06 10:22:49 +07:00
commit f6dbe32c86
No known key found for this signature in database
GPG key ID: 446A53638BE94409
227 changed files with 80367 additions and 12475 deletions


@@ -1,269 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "crypto/astrobwt/AstroBWT.h"
#include "backend/cpu/Cpu.h"
#include "base/crypto/sha3.h"
#include "base/tools/bswap_64.h"
#include "crypto/cn/CryptoNight.h"
#include <limits>
constexpr int STAGE1_SIZE = 147253;
constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE & 63));
constexpr int COUNTING_SORT_BITS = 10;
constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;
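// Sizing notes (inferred from the code below, not from upstream docs):
// STAGE1_SIZE is AstroBWT's fixed stage-1 keystream length; stage 2 can grow
// by up to another 0xfffff bytes (see astrobwt_dero), which is why
// ALLOCATION_SIZE adds 1048576 plus cache-line alignment padding.
// COUNTING_SORT_BITS gives 2^10 = 1024 buckets per counting-sort pass.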
static bool astrobwtInitialized = false;
#ifdef ASTROBWT_AVX2
static bool hasAVX2 = false;
extern "C"
#ifndef _MSC_VER
__attribute__((ms_abi))
#endif
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
#endif
#ifdef XMRIG_ARM
extern "C" {
#include "salsa20_ref/ecrypt-sync.h"
}
static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
uint8_t iv[8] = {};
ECRYPT_ctx ctx;
ECRYPT_keysetup(&ctx, static_cast<const uint8_t*>(key), 256, 64);
ECRYPT_ivsetup(&ctx, iv);
ECRYPT_keystream_bytes(&ctx, static_cast<uint8_t*>(output), size);
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#else
#include "Salsa20.hpp"
static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
{
const uint64_t iv = 0;
ZeroTier::Salsa20 s(key, &iv);
s.XORKeyStream(output, static_cast<uint32_t>(size));
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
extern "C" int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key);
static void Salsa20_XORKeyStream_AVX256(const void* key, void* output, size_t size)
{
const uint64_t iv = 0;
salsa20_stream_avx2(output, size, &iv, key);
memset(static_cast<uint8_t*>(output) - 16, 0, 16);
memset(static_cast<uint8_t*>(output) + size, 0, 16);
}
#endif
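// Note (inferred from the call sites below): each wrapper above zeroes
// 16 guard bytes just before and just after the keystream. sort_indices()
// reads 8 bytes at each suffix position (and up to ~12 bytes past the end
// in its tie-break compare), and astrobwt_dero() indexes tmp = output - 1,
// so these pads keep those out-of-range reads well-defined.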
void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indices)
{
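// Two counting-sort passes over the top 2 * COUNTING_SORT_BITS bits of each
// suffix's big-endian 8-byte prefix, then an insertion-sort fix-up for
// suffixes whose first 43 bits collide. Each packed entry stores the prefix
// in its high 43 bits and the suffix position in its low 21 bits.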
uint32_t counters[2][COUNTING_SORT_SIZE] = {};
for (int i = 0; i < N; ++i)
{
const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i));
++counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)];
++counters[1][k >> (64 - COUNTING_SORT_BITS)];
}
uint32_t prev[2] = { counters[0][0], counters[1][0] };
counters[0][0] = prev[0] - 1;
counters[1][0] = prev[1] - 1;
for (int i = 1; i < COUNTING_SORT_SIZE; ++i)
{
const uint32_t cur[2] = { counters[0][i] + prev[0], counters[1][i] + prev[1] };
counters[0][i] = cur[0] - 1;
counters[1][i] = cur[1] - 1;
prev[0] = cur[0];
prev[1] = cur[1];
}
for (int i = N - 1; i >= 0; --i)
{
const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i));
tmp_indices[counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)]--] = (k & (static_cast<uint64_t>(-1) << 21)) | i;
}
for (int i = N - 1; i >= 0; --i)
{
const uint64_t data = tmp_indices[i];
indices[counters[1][data >> (64 - COUNTING_SORT_BITS)]--] = data;
}
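// Comparator for the fix-up pass: when the packed 43-bit prefixes are equal
// (so the first five bytes of both suffixes match), tie-break by re-reading
// eight bytes at offset +5 from each suffix position.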
auto smaller = [v](uint64_t a, uint64_t b)
{
const uint64_t value_a = a >> 21;
const uint64_t value_b = b >> 21;
if (value_a < value_b) {
return true;
}
if (value_a > value_b) {
return false;
}
const uint64_t data_a = bswap_64(*reinterpret_cast<const uint64_t*>(v + (a % (1 << 21)) + 5));
const uint64_t data_b = bswap_64(*reinterpret_cast<const uint64_t*>(v + (b % (1 << 21)) + 5));
return (data_a < data_b);
};
uint64_t prev_t = indices[0];
for (int i = 1; i < N; ++i)
{
uint64_t t = indices[i];
if (smaller(t, prev_t))
{
const uint64_t t2 = prev_t;
int j = i - 1;
do
{
indices[j + 1] = prev_t;
--j;
if (j < 0) {
break;
}
prev_t = indices[j];
} while (smaller(t, prev_t));
indices[j + 1] = t;
t = t2;
}
prev_t = t;
}
}
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2)
{
alignas(8) uint8_t key[32];
uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
uint8_t* stage1_output = scratchpad_ptr;
uint8_t* stage2_output = scratchpad_ptr;
uint64_t* indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE);
uint64_t* tmp_indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE * 9);
uint8_t* stage1_result = (uint8_t*)(tmp_indices);
uint8_t* stage2_result = (uint8_t*)(tmp_indices);
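// Scratchpad layout, relative to scratchpad + 64: keystream/output at 0,
// indices at ALLOCATION_SIZE, tmp_indices at 9 * ALLOCATION_SIZE, with the
// sorted result then written over the tmp_indices region. Rough implied
// footprint: about 17 * ALLOCATION_SIZE bytes (~20 MiB). This is an estimate
// from the offsets here, not a documented requirement.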
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2) {
SHA3_256_AVX2_ASM(input_data, input_size, key);
Salsa20_XORKeyStream_AVX256(key, stage1_output, STAGE1_SIZE);
}
else
#endif
{
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
}
sort_indices(STAGE1_SIZE + 1, stage1_output, indices, tmp_indices);
{
const uint8_t* tmp = stage1_output - 1;
for (int i = 0; i <= STAGE1_SIZE; ++i) {
stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
}
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff);
if (stage2_size > stage2_max_size) {
return false;
}
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2) {
Salsa20_XORKeyStream_AVX256(key, stage2_output, stage2_size);
}
else
#endif
{
Salsa20_XORKeyStream(key, stage2_output, stage2_size);
}
sort_indices(stage2_size + 1, stage2_output, indices, tmp_indices);
{
const uint8_t* tmp = stage2_output - 1;
int i = 0;
const int n = ((stage2_size + 1) / 4) * 4;
for (; i < n; i += 4)
{
stage2_result[i + 0] = tmp[indices[i + 0] & ((1 << 21) - 1)];
stage2_result[i + 1] = tmp[indices[i + 1] & ((1 << 21) - 1)];
stage2_result[i + 2] = tmp[indices[i + 2] & ((1 << 21) - 1)];
stage2_result[i + 3] = tmp[indices[i + 3] & ((1 << 21) - 1)];
}
for (; i <= stage2_size; ++i) {
stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
}
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
return true;
}
void xmrig::astrobwt::init()
{
if (!astrobwtInitialized) {
# ifdef ASTROBWT_AVX2
hasAVX2 = Cpu::info()->hasAVX2();
# endif
astrobwtInitialized = true;
}
}
template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true);
}


@@ -1,44 +0,0 @@
/* XMRig
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
* Copyright (c) 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "base/crypto/Algorithm.h"
struct cryptonight_ctx;
namespace xmrig {
namespace astrobwt {
bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2);
void init();
template<Algorithm::Id ALGO>
void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
template<>
void single_hash<Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
}} // namespace xmrig::astrobwt
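// A minimal usage sketch for the API declared above (not part of the
// original commit). The ~20 MiB scratchpad size is an estimate derived from
// the buffer offsets in the implementation file; the include path is the one
// that file uses. Real callers pass per-thread cryptonight_ctx memory instead
// of a shared static buffer.
#include "crypto/astrobwt/AstroBWT.h"
#include <cstdint>
#include <limits>
#include <vector>
static bool hash_blob_example(const uint8_t* blob, uint32_t blob_size, uint8_t out[32])
{
    xmrig::astrobwt::init();
    static std::vector<uint8_t> scratchpad(20u << 20); // ~20 MiB working area
    return xmrig::astrobwt::astrobwt_dero(blob, blob_size, scratchpad.data(),
                                          out, std::numeric_limits<int>::max(), true);
}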


@@ -1,352 +0,0 @@
/*
* Based on public domain code available at: http://cr.yp.to/snuffle.html
*
* Modifications and C-native SSE macro based SSE implementation by
* Adam Ierymenko <adam.ierymenko@zerotier.com>.
*
* Additional modifications and code cleanup for AstroBWT by
* SChernykh <https://github.com/SChernykh>
*
* Since the original was public domain, this is too.
*/
#include "Salsa20.hpp"
// Statically compute and define SSE constants
class _s20sseconsts
{
public:
_s20sseconsts()
{
maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
maskHi32 = _mm_slli_epi64(maskLo32, 32);
}
__m128i maskLo32,maskHi32;
};
static const _s20sseconsts _S20SSECONSTANTS;
namespace ZeroTier {
void Salsa20::init(const void *key,const void *iv)
{
const uint32_t *const k = (const uint32_t *)key;
_state.i[0] = 0x61707865;
_state.i[1] = 0x3320646e;
_state.i[2] = 0x79622d32;
_state.i[3] = 0x6b206574;
_state.i[4] = k[3];
_state.i[5] = 0;
_state.i[6] = k[7];
_state.i[7] = k[2];
_state.i[8] = 0;
_state.i[9] = k[6];
_state.i[10] = k[1];
_state.i[11] = ((const uint32_t *)iv)[1];
_state.i[12] = k[5];
_state.i[13] = k[0];
_state.i[14] = ((const uint32_t *)iv)[0];
_state.i[15] = k[4];
}
void Salsa20::XORKeyStream(void *out,unsigned int bytes)
{
uint8_t tmp[64];
uint8_t *c = (uint8_t *)out;
uint8_t *ctarget = c;
unsigned int i;
if (!bytes)
return;
for (;;) {
if (bytes < 64) {
for (i = 0;i < bytes;++i)
tmp[i] = 0;
ctarget = c;
c = tmp;
}
__m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
__m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
__m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
__m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
__m128i T;
__m128i X0s = X0;
__m128i X1s = X1;
__m128i X2s = X2;
__m128i X3s = X3;
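// Ten unrolled "2X round" blocks follow: 10 double-rounds, i.e. Salsa20/20.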
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
// 2X round -------------------------------------------------------------
T = _mm_add_epi32(X0, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x93);
X0 = _mm_add_epi32(X0s,X0);
X1 = _mm_add_epi32(X1s,X1);
X2 = _mm_add_epi32(X2s,X2);
X3 = _mm_add_epi32(X3s,X3);
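// Undo the SSE-oriented state reordering (see the "state reordered for SSE"
// note below): rebuild the canonical little-endian output word order from
// the four shuffled rows before storing.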
__m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
__m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
__m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
__m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
_mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_unpackhi_epi64(k02,k20)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_unpackhi_epi64(k13,k31)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_unpacklo_epi64(k20,k02)));
_mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_unpacklo_epi64(k31,k13)));
if (!(++_state.i[8])) {
++_state.i[5]; // state reordered for SSE
/* stopping at 2^70 bytes per nonce is user's responsibility */
}
if (bytes <= 64) {
if (bytes < 64) {
for (i = 0;i < bytes;++i)
ctarget[i] = c[i];
}
return;
}
bytes -= 64;
c += 64;
}
}
} // namespace ZeroTier


@@ -1,52 +0,0 @@
/*
* Based on public domain code available at: http://cr.yp.to/snuffle.html
*
* This therefore is public domain.
*/
#ifndef ZT_SALSA20_HPP
#define ZT_SALSA20_HPP
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
namespace ZeroTier {
/**
* Salsa20 stream cipher
*/
class Salsa20
{
public:
/**
* @param key 256-bit (32 byte) key
* @param iv 64-bit initialization vector
*/
Salsa20(const void *key,const void *iv)
{
init(key,iv);
}
/**
* Initialize cipher
*
* @param key Key bits
* @param iv 64-bit initialization vector
*/
void init(const void *key,const void *iv);
void XORKeyStream(void *out,unsigned int bytes);
private:
union {
__m128i v[4];
uint32_t i[16];
} _state;
};
} // namespace ZeroTier
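// Hedged usage sketch (not part of the original header): generating raw
// keystream by XOR-ing over a zeroed buffer, as the AstroBWT wrapper does.
// The key and IV values are placeholders.
static inline void salsa20_keystream_example()
{
    uint8_t key[32] = {};                   // placeholder 256-bit key
    const uint64_t iv = 0;                  // 64-bit IV
    uint8_t stream[128] = {};               // zeroed, so XOR yields keystream
    ZeroTier::Salsa20 s(key, &iv);
    s.XORKeyStream(stream, sizeof(stream)); // stream now holds raw keystream
}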
#endif


@@ -1,272 +0,0 @@
/* ecrypt-config.h */
/* *** Normally, it should not be necessary to edit this file. *** */
#ifndef ECRYPT_CONFIG
#define ECRYPT_CONFIG
/* ------------------------------------------------------------------------- */
/* Guess the endianness of the target architecture. */
/*
* The LITTLE endian machines:
*/
#if defined(__ultrix) /* Older MIPS */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__alpha) /* Alpha */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(i386) /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__i386) /* x86 (gcc) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_M_IX86) /* x86 (MSC, Borland) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(_MSC_VER) /* x86 (surely MSC) */
#define ECRYPT_LITTLE_ENDIAN
#elif defined(__INTEL_COMPILER) /* x86 (surely Intel compiler icl.exe) */
#define ECRYPT_LITTLE_ENDIAN
/*
* The BIG endian machines:
*/
#elif defined(sun) /* Newer SPARCs */
#define ECRYPT_BIG_ENDIAN
#elif defined(__ppc__) /* PowerPC */
#define ECRYPT_BIG_ENDIAN
/*
* Finally machines with UNKNOWN endianness:
*/
#elif defined (_AIX) /* RS6000 */
#define ECRYPT_UNKNOWN
#elif defined(__hpux) /* HP-PA */
#define ECRYPT_UNKNOWN
#elif defined(__aux) /* 68K */
#define ECRYPT_UNKNOWN
#elif defined(__dgux) /* 88K (but P6 in latest boxes) */
#define ECRYPT_UNKNOWN
#elif defined(__sgi) /* Newer MIPS */
#define ECRYPT_UNKNOWN
#else /* Any other processor */
#define ECRYPT_UNKNOWN
#endif
/* ------------------------------------------------------------------------- */
/*
* Find minimal-width types to store 8-bit, 16-bit, 32-bit, and 64-bit
* integers.
*
* Note: to enable 64-bit types on 32-bit compilers, it might be
* necessary to switch from ISO C90 mode to ISO C99 mode (e.g., gcc
* -std=c99).
*/
#include <limits.h>
/* --- check char --- */
#if (UCHAR_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T char
#define U8C(v) (v##U)
#if (UCHAR_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (UCHAR_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T char
#define U16C(v) (v##U)
#endif
#if (UCHAR_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T char
#define U32C(v) (v##U)
#endif
#if (UCHAR_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T char
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check short --- */
#if (USHRT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T short
#define U8C(v) (v##U)
#if (USHRT_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (USHRT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T short
#define U16C(v) (v##U)
#endif
#if (USHRT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T short
#define U32C(v) (v##U)
#endif
#if (USHRT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T short
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check int --- */
#if (UINT_MAX / 0xFU > 0xFU)
#ifndef I8T
#define I8T int
#define U8C(v) (v##U)
#if (ULONG_MAX == 0xFFU)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (UINT_MAX / 0xFFU > 0xFFU)
#ifndef I16T
#define I16T int
#define U16C(v) (v##U)
#endif
#if (UINT_MAX / 0xFFFFU > 0xFFFFU)
#ifndef I32T
#define I32T int
#define U32C(v) (v##U)
#endif
#if (UINT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
#ifndef I64T
#define I64T int
#define U64C(v) (v##U)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check long --- */
#if (ULONG_MAX / 0xFUL > 0xFUL)
#ifndef I8T
#define I8T long
#define U8C(v) (v##UL)
#if (ULONG_MAX == 0xFFUL)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (ULONG_MAX / 0xFFUL > 0xFFUL)
#ifndef I16T
#define I16T long
#define U16C(v) (v##UL)
#endif
#if (ULONG_MAX / 0xFFFFUL > 0xFFFFUL)
#ifndef I32T
#define I32T long
#define U32C(v) (v##UL)
#endif
#if (ULONG_MAX / 0xFFFFFFFFUL > 0xFFFFFFFFUL)
#ifndef I64T
#define I64T long
#define U64C(v) (v##UL)
#define ECRYPT_NATIVE64
#endif
#endif
#endif
#endif
#endif
/* --- check long long --- */
#ifdef ULLONG_MAX
#if (ULLONG_MAX / 0xFULL > 0xFULL)
#ifndef I8T
#define I8T long long
#define U8C(v) (v##ULL)
#if (ULLONG_MAX == 0xFFULL)
#define ECRYPT_I8T_IS_BYTE
#endif
#endif
#if (ULLONG_MAX / 0xFFULL > 0xFFULL)
#ifndef I16T
#define I16T long long
#define U16C(v) (v##ULL)
#endif
#if (ULLONG_MAX / 0xFFFFULL > 0xFFFFULL)
#ifndef I32T
#define I32T long long
#define U32C(v) (v##ULL)
#endif
#if (ULLONG_MAX / 0xFFFFFFFFULL > 0xFFFFFFFFULL)
#ifndef I64T
#define I64T long long
#define U64C(v) (v##ULL)
#endif
#endif
#endif
#endif
#endif
#endif
/* --- check __int64 --- */
#ifdef _UI64_MAX
#if (_UI64_MAX / 0xFFFFFFFFui64 > 0xFFFFFFFFui64)
#ifndef I64T
#define I64T __int64
#define U64C(v) (v##ui64)
#endif
#endif
#endif
/* ------------------------------------------------------------------------- */
#endif


@@ -1,46 +0,0 @@
/* ecrypt-machine.h */
/*
* This file is included by 'ecrypt-portable.h'. It lets you override
* the default macros for specific platforms. Please carefully check
* the machine code generated by your compiler (with optimisations
* turned on) before deciding to edit this file.
*/
/* ------------------------------------------------------------------------- */
#if (defined(ECRYPT_DEFAULT_ROT) && !defined(ECRYPT_MACHINE_ROT))
#define ECRYPT_MACHINE_ROT
#if (defined(WIN32) && defined(_MSC_VER))
#undef ROTL32
#undef ROTR32
#undef ROTL64
#undef ROTR64
#include <stdlib.h>
#define ROTL32(v, n) _lrotl(v, n)
#define ROTR32(v, n) _lrotr(v, n)
#define ROTL64(v, n) _rotl64(v, n)
#define ROTR64(v, n) _rotr64(v, n)
#endif
#endif
/* ------------------------------------------------------------------------- */
#if (defined(ECRYPT_DEFAULT_SWAP) && !defined(ECRYPT_MACHINE_SWAP))
#define ECRYPT_MACHINE_SWAP
/*
* If you want to override the default swap macros, put them here.
*/
#endif
/* ------------------------------------------------------------------------- */


@@ -1,303 +0,0 @@
/* ecrypt-portable.h */
/*
* WARNING: the conversions defined below are implemented as macros,
* and should be used carefully. They should NOT be used with
* arguments that have side effects. E.g., the following two lines
* are not equivalent:
*
* 1) ++x; y = ROTL32(x, n);
* 2) y = ROTL32(++x, n);
*/
/*
* *** Please do not edit this file. ***
*
* The default macros can be overridden for specific architectures by
* editing 'ecrypt-machine.h'.
*/
#ifndef ECRYPT_PORTABLE
#define ECRYPT_PORTABLE
#include "ecrypt-config.h"
/* ------------------------------------------------------------------------- */
/*
* The following types are defined (if available):
*
* u8: unsigned integer type, at least 8 bits
* u16: unsigned integer type, at least 16 bits
* u32: unsigned integer type, at least 32 bits
* u64: unsigned integer type, at least 64 bits
*
* s8, s16, s32, s64 -> signed counterparts of u8, u16, u32, u64
*
* The selection of minimum-width integer types is taken care of by
* 'ecrypt-config.h'. Note: to enable 64-bit types on 32-bit
* compilers, it might be necessary to switch from ISO C90 mode to ISO
* C99 mode (e.g., gcc -std=c99).
*/
#ifdef I8T
typedef signed I8T s8;
typedef unsigned I8T u8;
#endif
#ifdef I16T
typedef signed I16T s16;
typedef unsigned I16T u16;
#endif
#ifdef I32T
typedef signed I32T s32;
typedef unsigned I32T u32;
#endif
#ifdef I64T
typedef signed I64T s64;
typedef unsigned I64T u64;
#endif
/*
* The following macros are used to obtain exact-width results.
*/
#define U8V(v) ((u8)(v) & U8C(0xFF))
#define U16V(v) ((u16)(v) & U16C(0xFFFF))
#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))
#define U64V(v) ((u64)(v) & U64C(0xFFFFFFFFFFFFFFFF))
/* ------------------------------------------------------------------------- */
/*
* The following macros return words with their bits rotated over n
* positions to the left/right.
*/
#define ECRYPT_DEFAULT_ROT
#define ROTL8(v, n) \
(U8V((v) << (n)) | ((v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((v) << (n)) | ((v) >> (16 - (n))))
#define ROTL32(v, n) \
(U32V((v) << (n)) | ((v) >> (32 - (n))))
#define ROTL64(v, n) \
(U64V((v) << (n)) | ((v) >> (64 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#include "ecrypt-machine.h"
/* ------------------------------------------------------------------------- */
/*
* The following macros return a word with bytes in reverse order.
*/
#define ECRYPT_DEFAULT_SWAP
#define SWAP16(v) \
ROTL16(v, 8)
#define SWAP32(v) \
((ROTL32(v, 8) & U32C(0x00FF00FF)) | \
(ROTL32(v, 24) & U32C(0xFF00FF00)))
#ifdef ECRYPT_NATIVE64
#define SWAP64(v) \
((ROTL64(v, 8) & U64C(0x000000FF000000FF)) | \
(ROTL64(v, 24) & U64C(0x0000FF000000FF00)) | \
(ROTL64(v, 40) & U64C(0x00FF000000FF0000)) | \
(ROTL64(v, 56) & U64C(0xFF000000FF000000)))
#else
#define SWAP64(v) \
(((u64)SWAP32(U32V(v)) << 32) | (u64)SWAP32(U32V(v >> 32)))
#endif
#include "ecrypt-machine.h"
#define ECRYPT_DEFAULT_WTOW
#ifdef ECRYPT_LITTLE_ENDIAN
#define U16TO16_LITTLE(v) (v)
#define U32TO32_LITTLE(v) (v)
#define U64TO64_LITTLE(v) (v)
#define U16TO16_BIG(v) SWAP16(v)
#define U32TO32_BIG(v) SWAP32(v)
#define U64TO64_BIG(v) SWAP64(v)
#endif
#ifdef ECRYPT_BIG_ENDIAN
#define U16TO16_LITTLE(v) SWAP16(v)
#define U32TO32_LITTLE(v) SWAP32(v)
#define U64TO64_LITTLE(v) SWAP64(v)
#define U16TO16_BIG(v) (v)
#define U32TO32_BIG(v) (v)
#define U64TO64_BIG(v) (v)
#endif
#include "ecrypt-machine.h"
/*
* The following macros load words from an array of bytes with
* different types of endianness, and vice versa.
*/
#define ECRYPT_DEFAULT_BTOW
#if (!defined(ECRYPT_UNKNOWN) && defined(ECRYPT_I8T_IS_BYTE))
#define U8TO16_LITTLE(p) U16TO16_LITTLE(((u16*)(p))[0])
#define U8TO32_LITTLE(p) U32TO32_LITTLE(((u32*)(p))[0])
#define U8TO64_LITTLE(p) U64TO64_LITTLE(((u64*)(p))[0])
#define U8TO16_BIG(p) U16TO16_BIG(((u16*)(p))[0])
#define U8TO32_BIG(p) U32TO32_BIG(((u32*)(p))[0])
#define U8TO64_BIG(p) U64TO64_BIG(((u64*)(p))[0])
#define U16TO8_LITTLE(p, v) (((u16*)(p))[0] = U16TO16_LITTLE(v))
#define U32TO8_LITTLE(p, v) (((u32*)(p))[0] = U32TO32_LITTLE(v))
#define U64TO8_LITTLE(p, v) (((u64*)(p))[0] = U64TO64_LITTLE(v))
#define U16TO8_BIG(p, v) (((u16*)(p))[0] = U16TO16_BIG(v))
#define U32TO8_BIG(p, v) (((u32*)(p))[0] = U32TO32_BIG(v))
#define U64TO8_BIG(p, v) (((u64*)(p))[0] = U64TO64_BIG(v))
#else
#define U8TO16_LITTLE(p) \
(((u16)((p)[0]) ) | \
((u16)((p)[1]) << 8))
#define U8TO32_LITTLE(p) \
(((u32)((p)[0]) ) | \
((u32)((p)[1]) << 8) | \
((u32)((p)[2]) << 16) | \
((u32)((p)[3]) << 24))
#ifdef ECRYPT_NATIVE64
#define U8TO64_LITTLE(p) \
(((u64)((p)[0]) ) | \
((u64)((p)[1]) << 8) | \
((u64)((p)[2]) << 16) | \
((u64)((p)[3]) << 24) | \
((u64)((p)[4]) << 32) | \
((u64)((p)[5]) << 40) | \
((u64)((p)[6]) << 48) | \
((u64)((p)[7]) << 56))
#else
#define U8TO64_LITTLE(p) \
((u64)U8TO32_LITTLE(p) | ((u64)U8TO32_LITTLE((p) + 4) << 32))
#endif
#define U8TO16_BIG(p) \
(((u16)((p)[0]) << 8) | \
((u16)((p)[1]) ))
#define U8TO32_BIG(p) \
(((u32)((p)[0]) << 24) | \
((u32)((p)[1]) << 16) | \
((u32)((p)[2]) << 8) | \
((u32)((p)[3]) ))
#ifdef ECRYPT_NATIVE64
#define U8TO64_BIG(p) \
(((u64)((p)[0]) << 56) | \
((u64)((p)[1]) << 48) | \
((u64)((p)[2]) << 40) | \
((u64)((p)[3]) << 32) | \
((u64)((p)[4]) << 24) | \
((u64)((p)[5]) << 16) | \
((u64)((p)[6]) << 8) | \
((u64)((p)[7]) ))
#else
#define U8TO64_BIG(p) \
(((u64)U8TO32_BIG(p) << 32) | (u64)U8TO32_BIG((p) + 4))
#endif
#define U16TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
} while (0)
#define U32TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
(p)[2] = U8V((v) >> 16); \
(p)[3] = U8V((v) >> 24); \
} while (0)
#ifdef ECRYPT_NATIVE64
#define U64TO8_LITTLE(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
(p)[2] = U8V((v) >> 16); \
(p)[3] = U8V((v) >> 24); \
(p)[4] = U8V((v) >> 32); \
(p)[5] = U8V((v) >> 40); \
(p)[6] = U8V((v) >> 48); \
(p)[7] = U8V((v) >> 56); \
} while (0)
#else
#define U64TO8_LITTLE(p, v) \
do { \
U32TO8_LITTLE((p), U32V((v) )); \
U32TO8_LITTLE((p) + 4, U32V((v) >> 32)); \
} while (0)
#endif
#define U16TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) ); \
(p)[1] = U8V((v) >> 8); \
} while (0)
#define U32TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) >> 24); \
(p)[1] = U8V((v) >> 16); \
(p)[2] = U8V((v) >> 8); \
(p)[3] = U8V((v) ); \
} while (0)
#ifdef ECRYPT_NATIVE64
#define U64TO8_BIG(p, v) \
do { \
(p)[0] = U8V((v) >> 56); \
(p)[1] = U8V((v) >> 48); \
(p)[2] = U8V((v) >> 40); \
(p)[3] = U8V((v) >> 32); \
(p)[4] = U8V((v) >> 24); \
(p)[5] = U8V((v) >> 16); \
(p)[6] = U8V((v) >> 8); \
(p)[7] = U8V((v) ); \
} while (0)
#else
#define U64TO8_BIG(p, v) \
do { \
U32TO8_BIG((p), U32V((v) >> 32)); \
U32TO8_BIG((p) + 4, U32V((v) )); \
} while (0)
#endif
#endif
#include "ecrypt-machine.h"
/* ------------------------------------------------------------------------- */
#endif


@@ -1,279 +0,0 @@
/* ecrypt-sync.h */
/*
* Header file for synchronous stream ciphers without an authentication
* mechanism.
*
* *** Please only edit parts marked with "[edit]". ***
*/
#ifndef ECRYPT_SYNC
#define ECRYPT_SYNC
#include "ecrypt-portable.h"
/* ------------------------------------------------------------------------- */
/* Cipher parameters */
/*
* The name of your cipher.
*/
#define ECRYPT_NAME "Salsa20" /* [edit] */
#define ECRYPT_PROFILE "S!_H."
/*
* Specify which key and IV sizes are supported by your cipher. A user
* should be able to enumerate the supported sizes by running the
* following code:
*
* for (i = 0; ECRYPT_KEYSIZE(i) <= ECRYPT_MAXKEYSIZE; ++i)
* {
* keysize = ECRYPT_KEYSIZE(i);
*
* ...
* }
*
* All sizes are in bits.
*/
#define ECRYPT_MAXKEYSIZE 256 /* [edit] */
#define ECRYPT_KEYSIZE(i) (128 + (i)*128) /* [edit] */
#define ECRYPT_MAXIVSIZE 64 /* [edit] */
#define ECRYPT_IVSIZE(i) (64 + (i)*64) /* [edit] */
/* ------------------------------------------------------------------------- */
/* Data structures */
/*
* ECRYPT_ctx is the structure containing the representation of the
* internal state of your cipher.
*/
typedef struct
{
u32 input[16]; /* could be compressed */
/*
* [edit]
*
* Put here all state variables needed during the encryption process.
*/
} ECRYPT_ctx;
/* ------------------------------------------------------------------------- */
/* Mandatory functions */
/*
* Key and message independent initialization. This function will be
* called once when the program starts (e.g., to build expanded S-box
* tables).
*/
void ECRYPT_init(void);
/*
* Key setup. It is the user's responsibility to select the values of
* keysize and ivsize from the set of supported values specified
* above.
*/
void ECRYPT_keysetup(
ECRYPT_ctx* ctx,
const u8* key,
u32 keysize, /* Key size in bits. */
u32 ivsize); /* IV size in bits. */
/*
* IV setup. After having called ECRYPT_keysetup(), the user is
* allowed to call ECRYPT_ivsetup() different times in order to
* encrypt/decrypt different messages with the same key but different
* IV's.
*/
void ECRYPT_ivsetup(
ECRYPT_ctx* ctx,
const u8* iv);
/*
* Encryption/decryption of arbitrary length messages.
*
* For efficiency reasons, the API provides two types of
* encrypt/decrypt functions. The ECRYPT_encrypt_bytes() function
* (declared here) encrypts byte strings of arbitrary length, while
* the ECRYPT_encrypt_blocks() function (defined later) only accepts
* lengths which are multiples of ECRYPT_BLOCKLENGTH.
*
* The user is allowed to make multiple calls to
* ECRYPT_encrypt_blocks() to incrementally encrypt a long message,
* but he is NOT allowed to make additional encryption calls once he
* has called ECRYPT_encrypt_bytes() (unless he starts a new message
* of course). For example, this sequence of calls is acceptable:
*
* ECRYPT_keysetup();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_bytes();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_blocks();
*
* ECRYPT_ivsetup();
* ECRYPT_encrypt_bytes();
*
* The following sequence is not:
*
* ECRYPT_keysetup();
* ECRYPT_ivsetup();
* ECRYPT_encrypt_blocks();
* ECRYPT_encrypt_bytes();
* ECRYPT_encrypt_blocks();
*/
void ECRYPT_encrypt_bytes(
ECRYPT_ctx* ctx,
const u8* plaintext,
u8* ciphertext,
u32 msglen); /* Message length in bytes. */
void ECRYPT_decrypt_bytes(
ECRYPT_ctx* ctx,
const u8* ciphertext,
u8* plaintext,
u32 msglen); /* Message length in bytes. */
/* ------------------------------------------------------------------------- */
/* Optional features */
/*
* For testing purposes it can sometimes be useful to have a function
* which immediately generates keystream without having to provide it
* with a zero plaintext. If your cipher cannot provide this function
* (e.g., because it is not strictly a synchronous cipher), please
* reset the ECRYPT_GENERATES_KEYSTREAM flag.
*/
#define ECRYPT_GENERATES_KEYSTREAM
#ifdef ECRYPT_GENERATES_KEYSTREAM
void ECRYPT_keystream_bytes(
ECRYPT_ctx* ctx,
u8* keystream,
u32 length); /* Length of keystream in bytes. */
#endif
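/*
 * Hedged usage sketch (not part of the original header), mirroring how the
 * ARM build's Salsa20_XORKeyStream wrapper drives this API: 256-bit key,
 * 64-bit IV, direct keystream generation. Key/IV contents are placeholders.
 *
 *   ECRYPT_ctx ctx;
 *   u8 key[32] = {0}, iv[8] = {0}, stream[64];
 *   ECRYPT_init();
 *   ECRYPT_keysetup(&ctx, key, 256, 64);
 *   ECRYPT_ivsetup(&ctx, iv);
 *   ECRYPT_keystream_bytes(&ctx, stream, sizeof(stream));
 */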
/* ------------------------------------------------------------------------- */
/* Optional optimizations */
/*
* By default, the functions in this section are implemented using
* calls to functions declared above. However, you might want to
* implement them differently for performance reasons.
*/
/*
* All-in-one encryption/decryption of (short) packets.
*
* The default definitions of these functions can be found in
* "ecrypt-sync.c". If you want to implement them differently, please
* undef the ECRYPT_USES_DEFAULT_ALL_IN_ONE flag.
*/
#define ECRYPT_USES_DEFAULT_ALL_IN_ONE /* [edit] */
void ECRYPT_encrypt_packet(
ECRYPT_ctx* ctx,
const u8* iv,
const u8* plaintext,
u8* ciphertext,
u32 msglen);
void ECRYPT_decrypt_packet(
ECRYPT_ctx* ctx,
const u8* iv,
const u8* ciphertext,
u8* plaintext,
u32 msglen);
/*
* Encryption/decryption of blocks.
*
* By default, these functions are defined as macros. If you want to
* provide a different implementation, please undef the
* ECRYPT_USES_DEFAULT_BLOCK_MACROS flag and implement the functions
* declared below.
*/
#define ECRYPT_BLOCKLENGTH 64 /* [edit] */
#define ECRYPT_USES_DEFAULT_BLOCK_MACROS /* [edit] */
#ifdef ECRYPT_USES_DEFAULT_BLOCK_MACROS
#define ECRYPT_encrypt_blocks(ctx, plaintext, ciphertext, blocks) \
ECRYPT_encrypt_bytes(ctx, plaintext, ciphertext, \
(blocks) * ECRYPT_BLOCKLENGTH)
#define ECRYPT_decrypt_blocks(ctx, ciphertext, plaintext, blocks) \
ECRYPT_decrypt_bytes(ctx, ciphertext, plaintext, \
(blocks) * ECRYPT_BLOCKLENGTH)
#ifdef ECRYPT_GENERATES_KEYSTREAM
#define ECRYPT_keystream_blocks(ctx, keystream, blocks) \
ECRYPT_keystream_bytes(ctx, keystream, \
(blocks) * ECRYPT_BLOCKLENGTH)
#endif
#else
void ECRYPT_encrypt_blocks(
ECRYPT_ctx* ctx,
const u8* plaintext,
u8* ciphertext,
u32 blocks); /* Message length in blocks. */
void ECRYPT_decrypt_blocks(
ECRYPT_ctx* ctx,
const u8* ciphertext,
u8* plaintext,
u32 blocks); /* Message length in blocks. */
#ifdef ECRYPT_GENERATES_KEYSTREAM
void ECRYPT_keystream_blocks(
ECRYPT_ctx* ctx,
const u8* keystream,
u32 blocks); /* Keystream length in blocks. */
#endif
#endif
/*
* If your cipher can be implemented in different ways, you can use
* the ECRYPT_VARIANT parameter to allow the user to choose between
* them at compile time (e.g., gcc -DECRYPT_VARIANT=3 ...). Please
* only use this possibility if you really think it could make a
* significant difference and keep the number of variants
* (ECRYPT_MAXVARIANT) as small as possible (definitely not more than
* 10). Note also that all variants should have exactly the same
* external interface (i.e., the same ECRYPT_BLOCKLENGTH, etc.).
*/
#define ECRYPT_MAXVARIANT 1 /* [edit] */
#ifndef ECRYPT_VARIANT
#define ECRYPT_VARIANT 1
#endif
#if (ECRYPT_VARIANT > ECRYPT_MAXVARIANT)
#error this variant does not exist
#endif
/* ------------------------------------------------------------------------- */
#endif


@@ -1,219 +0,0 @@
/*
salsa20-merged.c version 20051118
D. J. Bernstein
Public domain.
*/
#include "ecrypt-sync.h"
#define ROTATE(v,c) (ROTL32(v,c))
#define XOR(v,w) ((v) ^ (w))
#define PLUS(v,w) (U32V((v) + (w)))
#define PLUSONE(v) (PLUS((v),1))
void ECRYPT_init(void)
{
return;
}
static const char sigma[16] = "expand 32-byte k";
static const char tau[16] = "expand 16-byte k";
void ECRYPT_keysetup(ECRYPT_ctx *x,const u8 *k,u32 kbits,u32 ivbits)
{
const char *constants;
x->input[1] = U8TO32_LITTLE(k + 0);
x->input[2] = U8TO32_LITTLE(k + 4);
x->input[3] = U8TO32_LITTLE(k + 8);
x->input[4] = U8TO32_LITTLE(k + 12);
if (kbits == 256) { /* recommended */
k += 16;
constants = sigma;
} else { /* kbits == 128 */
constants = tau;
}
x->input[11] = U8TO32_LITTLE(k + 0);
x->input[12] = U8TO32_LITTLE(k + 4);
x->input[13] = U8TO32_LITTLE(k + 8);
x->input[14] = U8TO32_LITTLE(k + 12);
x->input[0] = U8TO32_LITTLE(constants + 0);
x->input[5] = U8TO32_LITTLE(constants + 4);
x->input[10] = U8TO32_LITTLE(constants + 8);
x->input[15] = U8TO32_LITTLE(constants + 12);
}
void ECRYPT_ivsetup(ECRYPT_ctx *x,const u8 *iv)
{
x->input[6] = U8TO32_LITTLE(iv + 0);
x->input[7] = U8TO32_LITTLE(iv + 4);
x->input[8] = 0;
x->input[9] = 0;
}
void ECRYPT_encrypt_bytes(ECRYPT_ctx *x,const u8 *m,u8 *c,u32 bytes)
{
u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
u8 *ctarget = 0;
u8 tmp[64];
int i;
if (!bytes) return;
j0 = x->input[0];
j1 = x->input[1];
j2 = x->input[2];
j3 = x->input[3];
j4 = x->input[4];
j5 = x->input[5];
j6 = x->input[6];
j7 = x->input[7];
j8 = x->input[8];
j9 = x->input[9];
j10 = x->input[10];
j11 = x->input[11];
j12 = x->input[12];
j13 = x->input[13];
j14 = x->input[14];
j15 = x->input[15];
for (;;) {
if (bytes < 64) {
for (i = 0;i < bytes;++i) tmp[i] = m[i];
m = tmp;
ctarget = c;
c = tmp;
}
x0 = j0;
x1 = j1;
x2 = j2;
x3 = j3;
x4 = j4;
x5 = j5;
x6 = j6;
x7 = j7;
x8 = j8;
x9 = j9;
x10 = j10;
x11 = j11;
x12 = j12;
x13 = j13;
x14 = j14;
x15 = j15;
for (i = 20;i > 0;i -= 2) {
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
}
x0 = PLUS(x0,j0);
x1 = PLUS(x1,j1);
x2 = PLUS(x2,j2);
x3 = PLUS(x3,j3);
x4 = PLUS(x4,j4);
x5 = PLUS(x5,j5);
x6 = PLUS(x6,j6);
x7 = PLUS(x7,j7);
x8 = PLUS(x8,j8);
x9 = PLUS(x9,j9);
x10 = PLUS(x10,j10);
x11 = PLUS(x11,j11);
x12 = PLUS(x12,j12);
x13 = PLUS(x13,j13);
x14 = PLUS(x14,j14);
x15 = PLUS(x15,j15);
x0 = XOR(x0,U8TO32_LITTLE(m + 0));
x1 = XOR(x1,U8TO32_LITTLE(m + 4));
x2 = XOR(x2,U8TO32_LITTLE(m + 8));
x3 = XOR(x3,U8TO32_LITTLE(m + 12));
x4 = XOR(x4,U8TO32_LITTLE(m + 16));
x5 = XOR(x5,U8TO32_LITTLE(m + 20));
x6 = XOR(x6,U8TO32_LITTLE(m + 24));
x7 = XOR(x7,U8TO32_LITTLE(m + 28));
x8 = XOR(x8,U8TO32_LITTLE(m + 32));
x9 = XOR(x9,U8TO32_LITTLE(m + 36));
x10 = XOR(x10,U8TO32_LITTLE(m + 40));
x11 = XOR(x11,U8TO32_LITTLE(m + 44));
x12 = XOR(x12,U8TO32_LITTLE(m + 48));
x13 = XOR(x13,U8TO32_LITTLE(m + 52));
x14 = XOR(x14,U8TO32_LITTLE(m + 56));
x15 = XOR(x15,U8TO32_LITTLE(m + 60));
j8 = PLUSONE(j8);
if (!j8) {
j9 = PLUSONE(j9);
/* stopping at 2^70 bytes per nonce is user's responsibility */
}
U32TO8_LITTLE(c + 0,x0);
U32TO8_LITTLE(c + 4,x1);
U32TO8_LITTLE(c + 8,x2);
U32TO8_LITTLE(c + 12,x3);
U32TO8_LITTLE(c + 16,x4);
U32TO8_LITTLE(c + 20,x5);
U32TO8_LITTLE(c + 24,x6);
U32TO8_LITTLE(c + 28,x7);
U32TO8_LITTLE(c + 32,x8);
U32TO8_LITTLE(c + 36,x9);
U32TO8_LITTLE(c + 40,x10);
U32TO8_LITTLE(c + 44,x11);
U32TO8_LITTLE(c + 48,x12);
U32TO8_LITTLE(c + 52,x13);
U32TO8_LITTLE(c + 56,x14);
U32TO8_LITTLE(c + 60,x15);
if (bytes <= 64) {
if (bytes < 64) {
for (i = 0;i < bytes;++i) ctarget[i] = c[i];
}
x->input[8] = j8;
x->input[9] = j9;
return;
}
bytes -= 64;
c += 64;
m += 64;
}
}
void ECRYPT_decrypt_bytes(ECRYPT_ctx *x,const u8 *c,u8 *m,u32 bytes)
{
ECRYPT_encrypt_bytes(x,c,m,bytes);
}
void ECRYPT_keystream_bytes(ECRYPT_ctx *x,u8 *stream,u32 bytes)
{
u32 i;
for (i = 0; i < bytes; ++i) stream[i] = 0;
ECRYPT_encrypt_bytes(x,stream,stream,bytes);
}


@@ -1,57 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif
#define ALIGN .balign
#define dq .quad
.global DECL(SHA3_256_AVX2_ASM)
ALIGN 64
DECL(SHA3_256_AVX2_ASM):
#include "sha3_256_avx2.inc"
KeccakF1600_AVX2_ASM:
lea r8,[rip+rot_left+96]
lea r9,[rip+rot_right+96]
lea r10,[rip+rndc]
#include "sha3_256_keccakf1600_avx2.inc"
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif


@@ -1,45 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE
PUBLIC SHA3_256_AVX2_ASM
ALIGN 64
SHA3_256_AVX2_ASM:
include sha3_256_avx2.inc
KeccakF1600_AVX2_ASM:
lea r8,[rot_left+96]
lea r9,[rot_right+96]
lea r10,[rndc]
include sha3_256_keccakf1600_avx2.inc
_SHA3_256_AVX2_ASM ENDS
END


@@ -1,162 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
vzeroupper
mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+16],rsi
mov qword ptr [rsp+24],rdi
push rbp
push r12
push r13
push r14
push r15
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
sub rsp,320
lea rbp,[rsp+64]
and rbp,-32
vpxor xmm0,xmm0,xmm0
xor edi,edi
mov dword ptr [rbp],50462976
mov r12,rdx
mov dword ptr [rbp+4],169150212
mov r14,rdx
mov dword ptr [rbp+8],218436623
shr r14,3
and r12d,7
mov dword ptr [rbp+12],135009046
mov r13,r8
mov byte ptr [rbp+16],9
mov rsi,rcx
mov ebx,edi
vmovdqa ymmword ptr [rbp+32],ymm0
vmovdqa ymmword ptr [rbp+64],ymm0
vmovdqa ymmword ptr [rbp+96],ymm0
vmovdqa ymmword ptr [rbp+128],ymm0
vmovdqa ymmword ptr [rbp+160],ymm0
vmovdqa ymmword ptr [rbp+192],ymm0
vmovdqa ymmword ptr [rbp+224],ymm0
test r14,r14
je sha3_main_loop_end
sha3_main_loop:
movzx eax,byte ptr [rbp+rbx]
lea rcx,[rbp+32]
lea rcx,[rcx+rax*8]
mov rax,qword ptr [rsi]
xor qword ptr [rcx],rax
lea r15,[rbx+1]
cmp rbx,16
jne skip_keccak
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
skip_keccak:
cmp rbx,16
mov rax,rdi
cmovne rax,r15
add rsi,8
mov rbx,rax
sub r14,1
jne sha3_main_loop
sha3_main_loop_end:
mov rdx,rdi
test r12,r12
je sha3_tail_loop_end
mov r8,rdi
sha3_tail_loop:
movzx eax,byte ptr [rdx+rsi]
inc rdx
shlx rcx,rax,r8
or rdi,rcx
add r8,8
cmp rdx,r12
jb sha3_tail_loop
sha3_tail_loop_end:
movzx eax,byte ptr [rbp+rbx]
lea rdx,[rbp+32]
lea rdx,[rdx+rax*8]
mov ecx,6
lea rax,[r12*8]
shlx rcx,rcx,rax
xor rcx,qword ptr [rdx]
mov eax,1
shl rax,63
xor rcx,rdi
mov qword ptr [rdx],rcx
xor qword ptr [rbp+104],rax
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
vmovups ymm0,ymmword ptr [rbp+32]
vmovups ymmword ptr [r13],ymm0
vzeroupper
add rsp,320
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rbp
mov rbx,qword ptr [rsp+8]
mov rsi,qword ptr [rsp+16]
mov rdi,qword ptr [rsp+24]
ret
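
The absorb loop above consumes the message 8 bytes at a time and runs the permutation after every 17th lane, which matches the 136-byte rate of SHA3-256 (17 lanes of 8 bytes); the byte table built at [rbp..rbp+16] appears to remap sequential lane indices into the permuted state layout used by the AVX2 permutation. A minimal C sketch of the same absorb structure, with the lane remapping omitted and keccakf1600 as a stand-in for KeccakF1600_AVX2_ASM:

#include <stdint.h>
#include <string.h>

#define SHA3_256_RATE_LANES 17  /* 136-byte rate / 8 bytes per lane */

void keccakf1600(uint64_t state[25]); /* stand-in for the asm permutation */

/* absorb phase only; the asm additionally applies the SHA-3 padding
   (0x06 ... 0x80) to the tail and copies the first 32 state bytes out */
static void sha3_absorb(uint64_t state[25], const uint8_t *in, size_t inlen)
{
    size_t lane = 0;
    while (inlen >= 8) {
        uint64_t w;
        memcpy(&w, in, 8);
        state[lane] ^= w;   /* the asm XORs into a remapped lane instead */
        in += 8;
        inlen -= 8;
        if (++lane == SHA3_256_RATE_LANES) {
            keccakf1600(state);
            lane = 0;
        }
    }
}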

View file

@ -1,203 +0,0 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
mov eax,24
lea rcx,[rcx+96]
vpbroadcastq ymm0,QWORD PTR [rcx-96]
vmovdqu ymm1,YMMWORD PTR [rcx-88]
vmovdqu ymm2,YMMWORD PTR [rcx-56]
vmovdqu ymm3,YMMWORD PTR [rcx-24]
vmovdqu ymm4,YMMWORD PTR [rcx+8]
vmovdqu ymm5,YMMWORD PTR [rcx+40]
vmovdqu ymm6,YMMWORD PTR [rcx+72]
ALIGN 64
Loop_avx2:
vpshufd ymm13,ymm2,78
vpxor ymm12,ymm5,ymm3
vpxor ymm9,ymm4,ymm6
vpxor ymm12,ymm12,ymm1
vpxor ymm12,ymm12,ymm9
vpermq ymm11,ymm12,147
vpxor ymm13,ymm13,ymm2
vpermq ymm7,ymm13,78
vpsrlq ymm8,ymm12,63
vpaddq ymm9,ymm12,ymm12
vpor ymm8,ymm8,ymm9
vpermq ymm15,ymm8,57
vpxor ymm14,ymm8,ymm11
vpermq ymm14,ymm14,0
vpxor ymm13,ymm13,ymm0
vpxor ymm13,ymm13,ymm7
vpsrlq ymm7,ymm13,63
vpaddq ymm8,ymm13,ymm13
vpor ymm8,ymm8,ymm7
vpxor ymm2,ymm2,ymm14
vpxor ymm0,ymm0,ymm14
vpblendd ymm15,ymm15,ymm8,192
vpblendd ymm11,ymm11,ymm13,3
vpxor ymm15,ymm15,ymm11
vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96]
vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96]
vpor ymm2,ymm2,ymm10
vpxor ymm3,ymm3,ymm15
vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32]
vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32]
vpor ymm3,ymm3,ymm11
vpxor ymm4,ymm4,ymm15
vpsllvq ymm12,ymm4,YMMWORD PTR [r8]
vpsrlvq ymm4,ymm4,YMMWORD PTR [r9]
vpor ymm4,ymm4,ymm12
vpxor ymm5,ymm5,ymm15
vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32]
vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32]
vpor ymm5,ymm5,ymm13
vpxor ymm6,ymm6,ymm15
vpermq ymm10,ymm2,141
vpermq ymm11,ymm3,141
vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64]
vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64]
vpor ymm8,ymm8,ymm14
vpxor ymm1,ymm1,ymm15
vpermq ymm12,ymm4,27
vpermq ymm13,ymm5,114
vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64]
vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64]
vpor ymm9,ymm9,ymm15
vpsrldq ymm14,ymm8,8
vpandn ymm7,ymm8,ymm14
vpblendd ymm3,ymm9,ymm13,12
vpblendd ymm15,ymm11,ymm9,12
vpblendd ymm5,ymm10,ymm11,12
vpblendd ymm14,ymm9,ymm10,12
vpblendd ymm3,ymm3,ymm11,48
vpblendd ymm15,ymm15,ymm12,48
vpblendd ymm5,ymm5,ymm9,48
vpblendd ymm14,ymm14,ymm13,48
vpblendd ymm3,ymm3,ymm12,192
vpblendd ymm15,ymm15,ymm13,192
vpblendd ymm5,ymm5,ymm13,192
vpblendd ymm14,ymm14,ymm11,192
vpandn ymm3,ymm3,ymm15
vpandn ymm5,ymm5,ymm14
vpblendd ymm6,ymm12,ymm9,12
vpblendd ymm15,ymm10,ymm12,12
vpxor ymm3,ymm3,ymm10
vpblendd ymm6,ymm6,ymm10,48
vpblendd ymm15,ymm15,ymm11,48
vpxor ymm5,ymm5,ymm12
vpblendd ymm6,ymm6,ymm11,192
vpblendd ymm15,ymm15,ymm9,192
vpandn ymm6,ymm6,ymm15
vpxor ymm6,ymm6,ymm13
vpermq ymm4,ymm8,30
vpblendd ymm15,ymm4,ymm0,48
vpermq ymm1,ymm8,57
vpblendd ymm1,ymm1,ymm0,192
vpandn ymm1,ymm1,ymm15
vpblendd ymm2,ymm11,ymm12,12
vpblendd ymm14,ymm13,ymm11,12
vpblendd ymm2,ymm2,ymm13,48
vpblendd ymm14,ymm14,ymm10,48
vpblendd ymm2,ymm2,ymm10,192
vpblendd ymm14,ymm14,ymm12,192
vpandn ymm2,ymm2,ymm14
vpxor ymm2,ymm2,ymm9
vpermq ymm7,ymm7,0
vpermq ymm3,ymm3,27
vpermq ymm5,ymm5,141
vpermq ymm6,ymm6,114
vpblendd ymm4,ymm13,ymm10,12
vpblendd ymm14,ymm12,ymm13,12
vpblendd ymm4,ymm4,ymm12,48
vpblendd ymm14,ymm14,ymm9,48
vpblendd ymm4,ymm4,ymm9,192
vpblendd ymm14,ymm14,ymm10,192
vpandn ymm4,ymm4,ymm14
vpxor ymm0,ymm0,ymm7
vpxor ymm1,ymm1,ymm8
vpxor ymm4,ymm4,ymm11
vpxor ymm0,ymm0,YMMWORD PTR [r10]
lea r10,[r10+32]
dec eax
jnz Loop_avx2
vmovq QWORD PTR [rcx-96],xmm0
vmovdqu YMMWORD PTR [rcx-88],ymm1
vmovdqu YMMWORD PTR [rcx-56],ymm2
vmovdqu YMMWORD PTR [rcx-24],ymm3
vmovdqu YMMWORD PTR [rcx+8],ymm4
vmovdqu YMMWORD PTR [rcx+40],ymm5
vmovdqu YMMWORD PTR [rcx+72],ymm6
ret
ALIGN 32
rot_left:
dq 3, 18, 36, 41
dq 1, 62, 28, 27
dq 45, 6, 56, 39
dq 10, 61, 55, 8
dq 2, 15, 25, 20
dq 44, 43, 21, 14
ALIGN 32
rot_right:
dq 64-3, 64-18, 64-36, 64-41
dq 64-1, 64-62, 64-28, 64-27
dq 64-45, 64-6, 64-56, 64-39
dq 64-10, 64-61, 64-55, 64-8
dq 64-2, 64-15, 64-25, 64-20
dq 64-44, 64-43, 64-21, 64-14
ALIGN 32
rndc:
dq 1, 1, 1, 1
dq 32898, 32898, 32898, 32898
dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714
dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224
dq 32907, 32907, 32907, 32907
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585
dq 138, 138, 138, 138
dq 136, 136, 136, 136
dq 2147516425, 2147516425, 2147516425, 2147516425
dq 2147483658, 2147483658, 2147483658, 2147483658
dq 2147516555, 2147516555, 2147516555, 2147516555
dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947
dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713
dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579
dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578
dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936
dq 32778, 32778, 32778, 32778
dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232
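
The rndc table holds the 24 Keccak-f[1600] round constants, each broadcast four times so a single 256-bit load feeds all four lanes of the interleaved state. For reference, a sketch that regenerates them with the standard degree-8 LFSR from the Keccak specification (it prints the same decimal values as the rows above):

#include <stdint.h>
#include <stdio.h>

/* rc(t): bit 0 of the LFSR x^8 + x^6 + x^5 + x^4 + 1 after t steps */
static int rc_bit(int t)
{
    uint8_t r = 1;
    for (int i = 0; i < t % 255; i++) {
        r = (uint8_t)((r << 1) ^ ((r >> 7) * 0x71));
    }
    return r & 1;
}

int main(void)
{
    for (int round = 0; round < 24; round++) {
        uint64_t rc = 0;
        for (int j = 0; j <= 6; j++) {   /* constants live at bits 2^j - 1 */
            if (rc_bit(7 * round + j)) {
                rc |= 1ULL << ((1 << j) - 1);
            }
        }
        printf("%llu\n", (unsigned long long)rc);
    }
    return 0;
}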

View file

@ -1,98 +0,0 @@
/*
* ISC License
*
* Copyright (c) 2013-2021
* Frank Denis <j at pureftpd dot org>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#define ROUNDS 20
typedef struct salsa_ctx {
uint32_t input[16];
} salsa_ctx;
static const int TR[16] = {
0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
};
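/* TR transposes the canonical Salsa20 word order into the diagonal layout the
   SIMD kernels below operate on: input[] slots 0-3, 4-7, 8-11 and 12-15 hold
   the state diagonals {0,5,10,15}, {12,1,6,11}, {8,13,2,7} and {4,9,14,3}.
   This is also why the 64-bit block counter (canonical words 8 and 9) is read
   back from x[8] and x[13] in the block loops. */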
#define LOAD32_LE(p) *((uint32_t*)(p))
#define STORE32_LE(dst, src) memcpy((dst), &(src), sizeof(uint32_t))
static void
salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
{
ctx->input[TR[1]] = LOAD32_LE(k + 0);
ctx->input[TR[2]] = LOAD32_LE(k + 4);
ctx->input[TR[3]] = LOAD32_LE(k + 8);
ctx->input[TR[4]] = LOAD32_LE(k + 12);
ctx->input[TR[11]] = LOAD32_LE(k + 16);
ctx->input[TR[12]] = LOAD32_LE(k + 20);
ctx->input[TR[13]] = LOAD32_LE(k + 24);
ctx->input[TR[14]] = LOAD32_LE(k + 28);
ctx->input[TR[0]] = 0x61707865;
ctx->input[TR[5]] = 0x3320646e;
ctx->input[TR[10]] = 0x79622d32;
ctx->input[TR[15]] = 0x6b206574;
}
static void
salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
{
ctx->input[TR[6]] = LOAD32_LE(iv + 0);
ctx->input[TR[7]] = LOAD32_LE(iv + 4);
ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0);
ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4);
}
static void
salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c,
unsigned long long bytes)
{
uint32_t * const x = &ctx->input[0];
if (!bytes) {
return; /* LCOV_EXCL_LINE */
}
#include "u8.h"
#include "u4.h"
#include "u1.h"
#include "u0.h"
}
int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key)
{
struct salsa_ctx ctx;
if (!clen) {
return 0;
}
salsa_keysetup(&ctx, (const uint8_t*)key);
salsa_ivsetup(&ctx, (const uint8_t*)iv, NULL);
memset(c, 0, clen);
salsa20_encrypt_bytes(&ctx, (const uint8_t*)c, (uint8_t*)c, clen);
return 0;
}
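
A minimal sketch of driving this entry point; the key and iv bytes are illustrative, and because the destination buffer is zeroed before encryption, the call yields the raw Salsa20/20 keystream:

#include <stdint.h>

int salsa20_stream_avx2(void *c, uint64_t clen, const void *iv, const void *key);

int example(void)
{
    uint8_t key[32] = {0};      /* illustrative 256-bit key */
    uint8_t iv[8]   = {0};      /* illustrative 64-bit iv */
    uint8_t keystream[256];
    return salsa20_stream_avx2(keystream, sizeof(keystream), iv, key);
}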

View file

@ -1,193 +0,0 @@
if (bytes > 0) {
__m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *) (partialblock + (A * 4)) = in##A; \
*(uint32_t *) (partialblock + (B * 4)) = in##B; \
*(uint32_t *) (partialblock + (C * 4)) = in##C; \
*(uint32_t *) (partialblock + (D * 4)) = in##D; \
} while (0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
}

View file

@ -1,207 +0,0 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(const uint32_t *) (m + (A * 4)); \
in##B ^= *(const uint32_t *) (m + (B * 4)); \
in##C ^= *(const uint32_t *) (m + (C * 4)); \
in##D ^= *(const uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
in8 = x[8];
in9 = x[13];
in8++;
if (in8 == 0) {
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
}
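
The in8/in9 bookkeeping at the bottom of the loop is a 64-bit block-counter increment split across two 32-bit state words; an equivalent sketch:

    uint64_t ctr = (uint64_t)x[8] | ((uint64_t)x[13] << 32);
    ctr += 1;                      /* one 64-byte block consumed */
    x[8]  = (uint32_t)ctr;         /* canonical counter word 8 */
    x[13] = (uint32_t)(ctr >> 32); /* canonical counter word 9, stored at slot 13 */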

View file

@ -1,547 +0,0 @@
if (bytes >= 256) {
__m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
__m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
z15;
__m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
orig9, orig10, orig11, orig12, orig13, orig14, orig15;
uint32_t in8;
uint32_t in9;
int i;
/* the element-broadcast immediates for _mm_shuffle_epi32 are, in order:
   0x00, 0x55, 0xaa, 0xff */
z0 = _mm_loadu_si128((const __m128i *) (x + 0));
z5 = _mm_shuffle_epi32(z0, 0x55);
z10 = _mm_shuffle_epi32(z0, 0xaa);
z15 = _mm_shuffle_epi32(z0, 0xff);
z0 = _mm_shuffle_epi32(z0, 0x00);
z1 = _mm_loadu_si128((const __m128i *) (x + 4));
z6 = _mm_shuffle_epi32(z1, 0xaa);
z11 = _mm_shuffle_epi32(z1, 0xff);
z12 = _mm_shuffle_epi32(z1, 0x00);
z1 = _mm_shuffle_epi32(z1, 0x55);
z2 = _mm_loadu_si128((const __m128i *) (x + 8));
z7 = _mm_shuffle_epi32(z2, 0xff);
z13 = _mm_shuffle_epi32(z2, 0x55);
z2 = _mm_shuffle_epi32(z2, 0xaa);
/* no z8 -> low half of the 64-bit block counter, filled in later */
z3 = _mm_loadu_si128((const __m128i *) (x + 12));
z4 = _mm_shuffle_epi32(z3, 0x00);
z14 = _mm_shuffle_epi32(z3, 0xaa);
z3 = _mm_shuffle_epi32(z3, 0xff);
/* no z9 -> high half of the 64-bit block counter, filled in later */
orig0 = z0;
orig1 = z1;
orig2 = z2;
orig3 = z3;
orig4 = z4;
orig5 = z5;
orig6 = z6;
orig7 = z7;
orig10 = z10;
orig11 = z11;
orig12 = z12;
orig13 = z13;
orig14 = z14;
orig15 = z15;
while (bytes >= 256) {
/* vector implementation for z8 and z9 */
/* not sure if it helps for only 4 blocks */
const __m128i addv8 = _mm_set_epi64x(1, 0);
const __m128i addv9 = _mm_set_epi64x(3, 2);
__m128i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13];
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
t8 = _mm_set1_epi64x(in89);
t9 = _mm_set1_epi64x(in89);
z8 = _mm_add_epi64(addv8, t8);
z9 = _mm_add_epi64(addv9, t9);
t8 = _mm_unpacklo_epi32(z8, z9);
t9 = _mm_unpackhi_epi32(z8, z9);
z8 = _mm_unpacklo_epi32(t8, t9);
z9 = _mm_unpackhi_epi32(t8, t9);
orig8 = z8;
orig9 = z9;
in89 += 4;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
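/* net effect of the unpack pairs above: SIMD lane b (0..3) of z8/z9 now
   carries the low/high halves of per-block counter in89 + b, and
   x[8]/x[13] skip ahead by the 4 blocks consumed */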
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm_add_epi32(y4, z0);
r4 = y4;
y4 = _mm_slli_epi32(y4, 7);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 25);
z4 = _mm_xor_si128(z4, r4);
y9 = z1;
y9 = _mm_add_epi32(y9, z5);
r9 = y9;
y9 = _mm_slli_epi32(y9, 7);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 25);
z9 = _mm_xor_si128(z9, r9);
y8 = z0;
y8 = _mm_add_epi32(y8, z4);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z5;
y13 = _mm_add_epi32(y13, z9);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y12 = z4;
y12 = _mm_add_epi32(y12, z8);
r12 = y12;
y12 = _mm_slli_epi32(y12, 13);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 19);
z12 = _mm_xor_si128(z12, r12);
y1 = z9;
y1 = _mm_add_epi32(y1, z13);
r1 = y1;
y1 = _mm_slli_epi32(y1, 13);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 19);
z1 = _mm_xor_si128(z1, r1);
y0 = z8;
y0 = _mm_add_epi32(y0, z12);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z13;
y5 = _mm_add_epi32(y5, z1);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y14 = z6;
y14 = _mm_add_epi32(y14, z10);
r14 = y14;
y14 = _mm_slli_epi32(y14, 7);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 25);
z14 = _mm_xor_si128(z14, r14);
y3 = z11;
y3 = _mm_add_epi32(y3, z15);
r3 = y3;
y3 = _mm_slli_epi32(y3, 7);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 25);
z3 = _mm_xor_si128(z3, r3);
y2 = z10;
y2 = _mm_add_epi32(y2, z14);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z15;
y7 = _mm_add_epi32(y7, z3);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y6 = z14;
y6 = _mm_add_epi32(y6, z2);
r6 = y6;
y6 = _mm_slli_epi32(y6, 13);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 19);
z6 = _mm_xor_si128(z6, r6);
y11 = z3;
y11 = _mm_add_epi32(y11, z7);
r11 = y11;
y11 = _mm_slli_epi32(y11, 13);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 19);
z11 = _mm_xor_si128(z11, r11);
y10 = z2;
y10 = _mm_add_epi32(y10, z6);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y1 = z3;
y1 = _mm_add_epi32(y1, z0);
r1 = y1;
y1 = _mm_slli_epi32(y1, 7);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 25);
z1 = _mm_xor_si128(z1, r1);
y15 = z7;
y15 = _mm_add_epi32(y15, z11);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
y6 = z4;
y6 = _mm_add_epi32(y6, z5);
r6 = y6;
y6 = _mm_slli_epi32(y6, 7);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 25);
z6 = _mm_xor_si128(z6, r6);
y2 = z0;
y2 = _mm_add_epi32(y2, z1);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z5;
y7 = _mm_add_epi32(y7, z6);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y3 = z1;
y3 = _mm_add_epi32(y3, z2);
r3 = y3;
y3 = _mm_slli_epi32(y3, 13);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 19);
z3 = _mm_xor_si128(z3, r3);
y4 = z6;
y4 = _mm_add_epi32(y4, z7);
r4 = y4;
y4 = _mm_slli_epi32(y4, 13);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 19);
z4 = _mm_xor_si128(z4, r4);
y0 = z2;
y0 = _mm_add_epi32(y0, z3);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z7;
y5 = _mm_add_epi32(y5, z4);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y11 = z9;
y11 = _mm_add_epi32(y11, z10);
r11 = y11;
y11 = _mm_slli_epi32(y11, 7);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 25);
z11 = _mm_xor_si128(z11, r11);
y12 = z14;
y12 = _mm_add_epi32(y12, z15);
r12 = y12;
y12 = _mm_slli_epi32(y12, 7);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 25);
z12 = _mm_xor_si128(z12, r12);
y8 = z10;
y8 = _mm_add_epi32(y8, z11);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z15;
y13 = _mm_add_epi32(y13, z12);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y9 = z11;
y9 = _mm_add_epi32(y9, z8);
r9 = y9;
y9 = _mm_slli_epi32(y9, 13);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 19);
z9 = _mm_xor_si128(z9, r9);
y14 = z12;
y14 = _mm_add_epi32(y14, z13);
r14 = y14;
y14 = _mm_slli_epi32(y14, 13);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 19);
z14 = _mm_xor_si128(z14, r14);
y10 = z8;
y10 = _mm_add_epi32(y10, z9);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y15 = z13;
y15 = _mm_add_epi32(y15, z14);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
}
/* store data; this macro replicates the original amd64-xmm6 code */
#define ONEQUAD_SHUFFLE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
\
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data; this macro replaces shuffle+mov with a direct extract; not much
 * difference */
#define ONEQUAD_EXTRACT(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 1); \
in##B = _mm_extract_epi32(z##B, 1); \
in##C = _mm_extract_epi32(z##C, 1); \
in##D = _mm_extract_epi32(z##D, 1); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 2); \
in##B = _mm_extract_epi32(z##B, 2); \
in##C = _mm_extract_epi32(z##C, 2); \
in##D = _mm_extract_epi32(z##D, 2); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 3); \
in##B = _mm_extract_epi32(z##B, 3); \
in##C = _mm_extract_epi32(z##C, 3); \
in##D = _mm_extract_epi32(z##D, 3); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data; this macro first transposes the data in registers, and then
 * stores it to memory. Much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
y##A = _mm_unpacklo_epi32(z##A, z##B); \
y##B = _mm_unpacklo_epi32(z##C, z##D); \
y##C = _mm_unpackhi_epi32(z##A, z##B); \
y##D = _mm_unpackhi_epi32(z##C, z##D); \
z##A = _mm_unpacklo_epi64(y##A, y##B); \
z##B = _mm_unpackhi_epi64(y##A, y##B); \
z##C = _mm_unpacklo_epi64(y##C, y##D); \
z##D = _mm_unpackhi_epi64(y##C, y##D); \
y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \
_mm_storeu_si128((__m128i *) (c + 0), y##A); \
y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \
_mm_storeu_si128((__m128i *) (c + 64), y##B); \
y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
_mm_storeu_si128((__m128i *) (c + 128), y##C); \
y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
_mm_storeu_si128((__m128i *) (c + 192), y##D)
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE
bytes -= 256;
c += 256;
m += 256;
}
}

View file

@ -1,477 +0,0 @@
if (bytes >= 512) {
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
/* the naive way seems as fast as (if not a bit faster than) the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while (bytes >= 512) {
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
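/* after the fix-up, lane b (0..7) of z8/z9 carries the low/high halves of
   per-block counter in89 + b; the permute is needed because the 256-bit
   unpacks shuffle within 128-bit lanes only */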
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
/* store data; this macro first transposes the data in registers, and then
 * stores it to memory. Much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((const __m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((const __m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((const __m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((const __m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((const __m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((const __m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((const __m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((const __m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, \
_mm256_loadu_si256((const __m256i*) (m + 0))); \
y##B = _mm256_xor_si256( \
y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \
y##C = _mm256_xor_si256( \
y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \
y##D = _mm256_xor_si256( \
y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \
y##A2 = _mm256_xor_si256( \
y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \
y##B2 = _mm256_xor_si256( \
y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \
y##C2 = _mm256_xor_si256( \
y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \
y##D2 = _mm256_xor_si256( \
y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), y##A); \
_mm256_storeu_si256((__m256i*) (c + 64), y##B); \
_mm256_storeu_si256((__m256i*) (c + 128), y##C); \
_mm256_storeu_si256((__m256i*) (c + 192), y##D); \
_mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
}

View file

@ -43,6 +43,7 @@ public:
constexpr inline size_t memory() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return Algorithm::l3(ALGO); }
constexpr inline uint32_t iterations() const { static_assert(Algorithm::isCN(ALGO), "invalid CRYPTONIGHT algorithm"); return CN_ITER; }
constexpr inline uint32_t mask() const { return static_cast<uint32_t>(((memory() - 1) / 16) * 16); }
constexpr inline bool half_mem() const { return mask() < memory() / 2; }
inline static uint32_t iterations(Algorithm::Id algo)
{
@ -108,6 +109,16 @@ public:
}
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
if (algo == Algorithm::CN_GR_1) {
return 0x3FFF0;
}
if (algo == Algorithm::CN_GR_5) {
return 0x1FFF0;
}
# endif
return ((Algorithm::l3(algo) - 1) / 16) * 16;
}
@ -136,6 +147,18 @@ template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::iterations() co
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::mask() const { return 0x1FFF0; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::mask() const { return 0x1FFF0; }
#ifdef XMRIG_ALGO_GHOSTRIDER
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_0>::iterations() const { return CN_ITER / 4; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_1>::iterations() const { return CN_ITER / 4; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_2>::iterations() const { return CN_ITER / 2; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_3>::iterations() const { return CN_ITER / 2; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_4>::iterations() const { return CN_ITER / 8; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_5>::iterations() const { return CN_ITER / 8; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_1>::mask() const { return 0x3FFF0; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GR_5>::mask() const { return 0x1FFF0; }
#endif
} /* namespace xmrig */
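
half_mem() flags variants whose mask confines the main loop to less than half of the allocated scratchpad, which lets callers initialize or prefetch only the touched half. A sketch with assumed scratchpad sizes (512 KiB for CN_GR_1 and 256 KiB for CN_GR_5, the usual cn/dark-lite and cn/turtle-lite figures; these sizes are illustrative, not taken from this diff):

#include <stdint.h>
#include <stdio.h>

static uint32_t cn_mask(uint32_t memory) { return ((memory - 1) / 16) * 16; }

int main(void)
{
    /* assumed memory sizes; the mask overrides are the ones declared above */
    const uint32_t mem_gr1 = 512 * 1024, mask_gr1 = 0x3FFF0;
    const uint32_t mem_gr5 = 256 * 1024, mask_gr5 = 0x1FFF0;
    printf("GR_1: default mask %#x, override %#x, half_mem %d\n",
           cn_mask(mem_gr1), mask_gr1, mask_gr1 < mem_gr1 / 2);  /* prints 1 */
    printf("GR_5: default mask %#x, override %#x, half_mem %d\n",
           cn_mask(mem_gr5), mask_gr5, mask_gr5 < mem_gr5 / 2);  /* prints 1 */
    return 0;
}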

View file

@ -35,11 +35,6 @@
#endif
#ifdef XMRIG_ALGO_ASTROBWT
# include "crypto/astrobwt/AstroBWT.h"
#endif
#define ADD_FN(algo) do { \
m_map[algo] = new cn_hash_fun_array{}; \
m_map[algo]->data[AV_SINGLE][Assembly::NONE] = cryptonight_single_hash<algo, false, 0>; \
@ -55,6 +50,10 @@
} while (0)
bool cn_sse41_enabled = false;
bool cn_vaes_enabled = false;
#ifdef XMRIG_FEATURE_ASM
# define ADD_FN_ASM(algo) do { \
m_map[algo]->data[AV_SINGLE][Assembly::INTEL] = cryptonight_single_hash_asm<algo, Assembly::INTEL>; \
@ -97,6 +96,27 @@ cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm = nullptr;
cn_mainloop_fun cn_upx2_mainloop_asm = nullptr;
cn_mainloop_fun cn_upx2_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr0_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr1_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr2_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr3_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr4_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr5_single_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr0_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr1_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr2_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr3_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr4_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr5_double_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr0_quad_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr1_quad_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr2_quad_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr3_quad_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr4_quad_mainloop_asm = nullptr;
cn_mainloop_fun cn_gr5_quad_mainloop_asm = nullptr;
template<Algorithm::Id SOURCE_ALGO = Algorithm::CN_2, typename T, typename U>
static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask = CnAlgo<Algorithm::CN_HALF>().mask())
@ -136,7 +156,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma
static void patchAsmVariants()
{
const int allocation_size = 131072;
constexpr size_t allocation_size = 0x20000;
auto base = static_cast<uint8_t *>(VirtualMemory::allocateExecutableMemory(allocation_size, false));
cn_half_mainloop_ivybridge_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x0000);
@ -173,6 +193,29 @@ static void patchAsmVariants()
cn_upx2_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x15000);
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
cn_gr0_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x16000);
cn_gr1_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x16800);
cn_gr2_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x17000);
cn_gr3_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x17800);
cn_gr4_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x18000);
cn_gr5_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x18800);
cn_gr0_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x19000);
cn_gr1_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x19800);
cn_gr2_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1A000);
cn_gr3_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1A800);
cn_gr4_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1B000);
cn_gr5_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1B800);
cn_gr0_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1C000);
cn_gr1_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1C800);
cn_gr2_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1D000);
cn_gr3_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1D800);
cn_gr4_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1E000);
cn_gr5_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1E800);
# endif
{
constexpr uint32_t ITER = CnAlgo<Algorithm::CN_HALF>().iterations();
@ -230,7 +273,30 @@ static void patchAsmVariants()
patchCode<Algorithm::CN_RWZ>(cn_upx2_mainloop_asm, cnv2_rwz_mainloop_asm, ITER, MASK);
patchCode<Algorithm::CN_RWZ>(cn_upx2_double_mainloop_asm, cnv2_rwz_double_mainloop_asm, ITER, MASK);
}
#endif
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
patchCode<Algorithm::CN_1>(cn_gr0_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
patchCode<Algorithm::CN_1>(cn_gr1_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
patchCode<Algorithm::CN_1>(cn_gr2_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
patchCode<Algorithm::CN_1>(cn_gr3_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
patchCode<Algorithm::CN_1>(cn_gr4_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
patchCode<Algorithm::CN_1>(cn_gr5_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
patchCode<Algorithm::CN_1>(cn_gr0_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
patchCode<Algorithm::CN_1>(cn_gr1_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
patchCode<Algorithm::CN_1>(cn_gr2_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
patchCode<Algorithm::CN_1>(cn_gr3_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
patchCode<Algorithm::CN_1>(cn_gr4_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
patchCode<Algorithm::CN_1>(cn_gr5_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
patchCode<Algorithm::CN_1>(cn_gr0_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
patchCode<Algorithm::CN_1>(cn_gr1_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
patchCode<Algorithm::CN_1>(cn_gr2_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
patchCode<Algorithm::CN_1>(cn_gr3_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
patchCode<Algorithm::CN_1>(cn_gr4_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
patchCode<Algorithm::CN_1>(cn_gr5_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
# endif
VirtualMemory::protectRX(base, allocation_size);
VirtualMemory::flushInstructionCache(base, allocation_size);
@ -304,10 +370,13 @@ xmrig::CnHash::CnHash()
m_map[Algorithm::AR2_WRKZ]->data[AV_SINGLE_SOFT][Assembly::NONE] = argon2::single_hash<Algorithm::AR2_WRKZ>;
# endif
# ifdef XMRIG_ALGO_ASTROBWT
m_map[Algorithm::ASTROBWT_DERO] = new cn_hash_fun_array{};
m_map[Algorithm::ASTROBWT_DERO]->data[AV_SINGLE][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO>;
m_map[Algorithm::ASTROBWT_DERO]->data[AV_SINGLE_SOFT][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO>;
# ifdef XMRIG_ALGO_GHOSTRIDER
ADD_FN(Algorithm::CN_GR_0);
ADD_FN(Algorithm::CN_GR_1);
ADD_FN(Algorithm::CN_GR_2);
ADD_FN(Algorithm::CN_GR_3);
ADD_FN(Algorithm::CN_GR_4);
ADD_FN(Algorithm::CN_GR_5);
# endif
# ifdef XMRIG_FEATURE_ASM
@ -316,6 +385,14 @@ xmrig::CnHash::CnHash()
}
xmrig::CnHash::~CnHash()
{
for (auto const& x : m_map) {
delete m_map[x.first];
}
}
xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av, Assembly::Id assembly)
{
assert(cnHash.m_map.count(algorithm));
@ -331,7 +408,7 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av,
# ifdef XMRIG_ALGO_CN_HEAVY
// cn-heavy optimization for Zen3 CPUs
if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) && (Cpu::info()->model() == 0x21)) {
switch (algorithm.id()) {
case Algorithm::CN_HEAVY_0:
return cryptonight_single_hash<Algorithm::CN_HEAVY_0, false, 3>;

View file

@@ -59,6 +59,7 @@ public:
};
CnHash();
virtual ~CnHash();
static cn_hash_fun fn(const Algorithm &algorithm, AlgoVariant av, Assembly::Id assembly);

View file

@@ -52,12 +52,17 @@ struct cryptonight_r_data {
struct cryptonight_ctx {
alignas(16) uint8_t state[224];
alignas(16) uint8_t *memory;
const uint32_t* tweak1_table;
uint64_t tweak1_2;
uint8_t unused[24];
const uint32_t *saes_table;
cn_mainloop_fun_ms_abi generated_code;
cryptonight_r_data generated_code_data;
alignas(16) uint8_t save_state[128];
bool first_half;
};
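// save_state holds the 128 bytes of AES feed-forward state captured half-way
// through cn_explode_scratchpad so a half_mem scratchpad's second half can be
// regenerated later; first_half tracks which half is currently materialized.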

View file

@@ -349,6 +349,9 @@ static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key)
}
alignas(64) static const uint32_t tweak1_table[256] = { 268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456 };
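// This table folds the old per-byte VARIANT1 bit hack into a single load:
// entry x equals ((0x7531 >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 3) << 28,
// i.e. 0, 1 << 28 (268435456) or 3 << 28 (805306368) depending on bits 0, 4
// and 5 of the selector byte.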
namespace xmrig {
@@ -368,12 +371,7 @@ static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m1
uint64_t vh = vgetq_lane_u64(tmp, 1);
mem_out[1] = vh ^ tweak1_table[static_cast<uint8_t>(vh >> 24)];
}
}

View file

@@ -204,4 +204,7 @@
v4_random_math(code##part, r##part); \
}
extern bool cn_sse41_enabled;
extern bool cn_vaes_enabled;
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */

View file

@@ -100,7 +100,7 @@ const static uint8_t test_output_r[] = {
// "cn/0"
const static uint8_t test_output_v0[256] = {
0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
@@ -115,7 +115,7 @@ const static uint8_t test_output_v0[160] = {
// "cn/1" Cryptonight variant 1 (Monero v7)
const static uint8_t test_output_v1[256] = {
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
@@ -130,7 +130,7 @@ const static uint8_t test_output_v1[160] = {
// "cn/2" Cryptonight variant 2 (Monero v8)
const static uint8_t test_output_v2[256] = {
0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
@@ -145,7 +145,7 @@ const static uint8_t test_output_v2[160] = {
// "cn/half"
const static uint8_t test_output_half[256] = {
0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD,
0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7,
0x02, 0xE6, 0x1D, 0x2B, 0xBC, 0x84, 0xB6, 0x71, 0x96, 0x71, 0xD5, 0x0C, 0xAC, 0x76, 0x0E, 0x6B,
@@ -160,7 +160,7 @@ const static uint8_t test_output_half[160] = {
// "cn/msr" Masari (MSR)
const static uint8_t test_output_msr[256] = {
0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5,
0x69, 0xDF, 0x38, 0x51, 0x1B, 0xB3, 0xEB, 0x7D, 0xE7, 0x6B, 0x08, 0x8E, 0xB6, 0x7E, 0xB7, 0x1C,
@@ -175,7 +175,7 @@ const static uint8_t test_output_msr[160] = {
// "cn/xao" Alloy (XAO)
const static uint8_t test_output_xao[256] = {
0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33,
0xF1, 0xC9, 0x98, 0x40, 0x42, 0xDE, 0x39, 0xD1, 0xBA, 0x2D, 0xAD, 0xEC, 0xFE, 0xEA, 0xD8, 0x46,
@@ -190,7 +190,7 @@ const static uint8_t test_output_xao[160] = {
// "cn/rto" Arto (RTO)
const static uint8_t test_output_rto[256] = {
0x82, 0x66, 0x1E, 0x1C, 0x6E, 0x64, 0x36, 0x66, 0x84, 0x06, 0x32, 0x7A, 0x9B, 0xB1, 0x13, 0x19,
0xA5, 0x56, 0x16, 0x15, 0xDF, 0xEC, 0x1C, 0x9E, 0xE3, 0x88, 0x4A, 0x6C, 0x1C, 0xEB, 0x76, 0xA5,
0xB3, 0xFB, 0xF4, 0x3F, 0x2B, 0x6A, 0x3A, 0x39, 0xA3, 0x6E, 0x08, 0x33, 0x67, 0x90, 0x31, 0xB9,
@@ -204,7 +204,7 @@ const static uint8_t test_output_rto[160] = {
};
// "cn/rwz"
const static uint8_t test_output_rwz[256] = {
0x5f, 0x56, 0xc6, 0xb0, 0x99, 0x6b, 0xa2, 0x3e, 0x0b, 0xba, 0x07, 0x29, 0xc9, 0x90, 0x74, 0x85,
0x5a, 0x10, 0xe3, 0x08, 0x7f, 0xdb, 0xfe, 0x94, 0x75, 0x33, 0x54, 0x73, 0x76, 0xf0, 0x75, 0xb8,
0x8b, 0x70, 0x43, 0x9a, 0xfc, 0xf5, 0xeb, 0x15, 0xbb, 0xf9, 0xad, 0x9d, 0x2a, 0xbd, 0x72, 0x52,
@@ -218,7 +218,7 @@ const static uint8_t test_output_rwz[160] = {
};
// "cn/zls"
const static uint8_t test_output_zls[256] = {
0x51, 0x6E, 0x33, 0xC6, 0xE4, 0x46, 0xAB, 0xBC, 0xCD, 0xAD, 0x18, 0xC0, 0x4C, 0xD9, 0xA2, 0x5E,
0x64, 0x10, 0x28, 0x53, 0xB2, 0x0A, 0x42, 0xDF, 0xDE, 0xAA, 0x8B, 0x59, 0x9E, 0xCF, 0x40, 0xE2,
0x0D, 0x62, 0x5B, 0x42, 0x18, 0xE2, 0x76, 0xAD, 0xD0, 0x74, 0x90, 0x60, 0x8D, 0xC4, 0xC7, 0x80,
@@ -232,7 +232,7 @@ const static uint8_t test_output_zls[160] = {
};
// "cn/ccx"
const static uint8_t test_output_ccx[256] = {
0xB3, 0xA1, 0x67, 0x86, 0xD2, 0xC9, 0x85, 0xEC, 0xAD, 0xC4, 0x5F, 0x91, 0x05, 0x27, 0xC7, 0xA1,
0x96, 0xF0, 0xE1, 0xE9, 0x7C, 0x87, 0x09, 0x38, 0x1D, 0x7D, 0x41, 0x93, 0x35, 0xF8, 0x16, 0x72,
0xC3, 0xBD, 0x8D, 0xE8, 0xD5, 0xAE, 0xB8, 0x59, 0x0A, 0x6C, 0xCB, 0x7B, 0x41, 0x30, 0xF7, 0x04,
@@ -246,7 +246,7 @@ const static uint8_t test_output_ccx[160] = {
};
// "cn/double"
const static uint8_t test_output_double[256] = {
0xAE, 0xFB, 0xB3, 0xF0, 0xCC, 0x88, 0x04, 0x6D, 0x11, 0x9F, 0x6C, 0x54, 0xB9, 0x6D, 0x90, 0xC9,
0xE8, 0x84, 0xEA, 0x3B, 0x59, 0x83, 0xA6, 0x0D, 0x50, 0xA4, 0x2D, 0x7D, 0x3E, 0xBE, 0x48, 0x21,
0x49, 0xCE, 0x8E, 0xF3, 0xBC, 0x8A, 0x36, 0xBF, 0x86, 0x37, 0x89, 0x55, 0x09, 0xBA, 0x22, 0xF8,
@@ -261,7 +261,7 @@ const static uint8_t test_output_double[160] = {
#ifdef XMRIG_ALGO_CN_LITE
// "cn-lite/0"
const static uint8_t test_output_v0_lite[256] = {
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
@@ -276,7 +276,7 @@ const static uint8_t test_output_v0_lite[160] = {
// "cn-lite/1" AEON v7
const static uint8_t test_output_v1_lite[256] = {
0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
@@ -293,7 +293,7 @@ const static uint8_t test_output_v1_lite[160] = {
#ifdef XMRIG_ALGO_CN_HEAVY
// "cn-heavy/0"
const static uint8_t test_output_v0_heavy[256] = {
0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2,
0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A,
@@ -308,7 +308,7 @@ const static uint8_t test_output_v0_heavy[160] = {
// "cn-heavy/xhv"
const static uint8_t test_output_xhv_heavy[256] = {
0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57,
0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6,
0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F,
@@ -323,7 +323,7 @@ const static uint8_t test_output_xhv_heavy[160] = {
// "cn-heavy/tube"
const static uint8_t test_output_tube_heavy[256] = {
0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF,
0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35,
0xCD, 0x3F, 0x29, 0xDF, 0x07, 0x4A, 0x14, 0xAD, 0x0B, 0x98, 0x99, 0x37, 0xCA, 0x14, 0x68, 0xA3,
@@ -340,7 +340,7 @@ const static uint8_t test_output_tube_heavy[160] = {
#ifdef XMRIG_ALGO_CN_PICO
// "cn-pico/trtl"
const static uint8_t test_output_pico_trtl[256] = {
0x08, 0xF4, 0x21, 0xD7, 0x83, 0x31, 0x17, 0x30, 0x0E, 0xDA, 0x66, 0xE9, 0x8F, 0x4A, 0x25, 0x69,
0x09, 0x3D, 0xF3, 0x00, 0x50, 0x01, 0x73, 0x94, 0x4E, 0xFC, 0x40, 0x1E, 0x9A, 0x4A, 0x17, 0xAF,
0xB2, 0x17, 0x2E, 0xC9, 0x46, 0x6E, 0x1A, 0xEE, 0x70, 0xEC, 0x85, 0x72, 0xA1, 0x4C, 0x23, 0x3E,
@@ -355,7 +355,7 @@ const static uint8_t test_output_pico_trtl[160] = {
// "cn-pico/tlo"
const static uint8_t test_output_pico_tlo[256] = {
0x99, 0x75, 0xF2, 0xC1, 0xB3, 0xB4, 0x54, 0x34, 0xA4, 0x93, 0x86, 0x21, 0x30, 0x97, 0xF3, 0x1B,
0xB4, 0xB9, 0xA6, 0x58, 0x6A, 0x7E, 0x81, 0xF4, 0x42, 0x9F, 0x6D, 0x5F, 0x65, 0xC3, 0x8D, 0x1A,
0xFC, 0x67, 0xDF, 0xCC, 0xB5, 0xFC, 0x90, 0xD7, 0x85, 0x5A, 0xE9, 0x03, 0x36, 0x1E, 0xAB, 0xD7,
@@ -372,7 +372,7 @@ const static uint8_t test_output_pico_tlo[160] = {
#ifdef XMRIG_ALGO_CN_FEMTO
// "cn/upx2"
const static uint8_t test_output_femto_upx2[256] = {
0xAA, 0xBB, 0xB8, 0xED, 0x14, 0xA8, 0x35, 0xFA, 0x22, 0xCF, 0xB1, 0xB5, 0xDE, 0xA8, 0x72, 0xB0,
0xA1, 0xD6, 0xCB, 0xD8, 0x46, 0xF4, 0x39, 0x1C, 0x0F, 0x01, 0xF3, 0x87, 0x5E, 0x3A, 0x37, 0x61,
0x38, 0x59, 0x15, 0x72, 0xF8, 0x20, 0xD4, 0xDE, 0x25, 0x3C, 0xF5, 0x5A, 0x21, 0x92, 0xB6, 0x22,
@@ -389,7 +389,7 @@ const static uint8_t test_output_femto_upx2[160] = {
#ifdef XMRIG_ALGO_ARGON2
// "argon2/chukwa"
const static uint8_t argon2_chukwa_test_out[256] = {
0xC1, 0x58, 0xA1, 0x05, 0xAE, 0x75, 0xC7, 0x56, 0x1C, 0xFD, 0x02, 0x90, 0x83, 0xA4, 0x7A, 0x87,
0x65, 0x3D, 0x51, 0xF9, 0x14, 0x12, 0x8E, 0x21, 0xC1, 0x97, 0x1D, 0x8B, 0x10, 0xC4, 0x90, 0x34,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -403,7 +403,7 @@ const static uint8_t argon2_chukwa_test_out[160] = {
};
// "argon2/chukwav2"
const static uint8_t argon2_chukwa_v2_test_out[256] = {
0x77, 0xCF, 0x69, 0x58, 0xB3, 0x53, 0x6E, 0x1F, 0x9F, 0x0D, 0x1E, 0xA1, 0x65, 0xF2, 0x28, 0x11,
0xCA, 0x7B, 0xC4, 0x87, 0xEA, 0x9F, 0x52, 0x03, 0x0B, 0x50, 0x50, 0xC1, 0x7F, 0xCD, 0xD8, 0xF5,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -417,7 +417,7 @@ const static uint8_t argon2_chukwa_v2_test_out[160] = {
};
// "argon2/wrkz"
const static uint8_t argon2_wrkz_test_out[256] = {
0x35, 0xE0, 0x83, 0xD4, 0xB9, 0xC6, 0x4C, 0x2A, 0x68, 0x82, 0x0A, 0x43, 0x1F, 0x61, 0x31, 0x19,
0x98, 0xA8, 0xCD, 0x18, 0x64, 0xDB, 0xA4, 0x07, 0x7E, 0x25, 0xB7, 0xF1, 0x21, 0xD5, 0x4B, 0xD1,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -432,21 +432,26 @@ const static uint8_t argon2_wrkz_test_out[160] = {
#endif
#ifdef XMRIG_ALGO_ASTROBWT
// "astrobwt"
const static uint8_t astrobwt_dero_test_out[160] = {
0x7E, 0x88, 0x44, 0xF2, 0xD6, 0xB7, 0xA4, 0x34, 0x98, 0xFE, 0x6D, 0x22, 0x65, 0x27, 0x68, 0x90,
0x23, 0xDA, 0x8A, 0x52, 0xF9, 0xFC, 0x4E, 0xC6, 0x9E, 0x5A, 0xAA, 0xA6, 0x3E, 0xDC, 0xE1, 0xC1,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif
#ifdef XMRIG_ALGO_GHOSTRIDER
// "GhostRider"
const static uint8_t test_output_gr[256] = {
0x42, 0x17, 0x0C, 0xC1, 0x85, 0xE6, 0x76, 0x3C, 0xC7, 0xCB, 0x27, 0xC4, 0x17, 0x39, 0x2D, 0xE2,
0x29, 0x6B, 0x40, 0x66, 0x85, 0xA4, 0xE3, 0xD3, 0x8C, 0xE9, 0xA5, 0x8F, 0x10, 0xFC, 0x81, 0xE4,
0x90, 0x56, 0xF2, 0x9E, 0x00, 0xD0, 0xF8, 0xA1, 0x88, 0x82, 0x86, 0xC0, 0x86, 0x04, 0x6B, 0x0E,
0x9A, 0xDB, 0xDB, 0xFD, 0x23, 0x16, 0x77, 0x94, 0xFE, 0x58, 0x93, 0x05, 0x10, 0x3F, 0x27, 0x75,
0x51, 0x44, 0xF3, 0x5F, 0xE2, 0xF9, 0x61, 0xBE, 0xC0, 0x30, 0xB5, 0x8E, 0xB1, 0x1B, 0xA1, 0xF7,
0x06, 0x4E, 0xF1, 0x6A, 0xFD, 0xA5, 0x44, 0x8E, 0x64, 0x47, 0x8C, 0x67, 0x51, 0xE2, 0x5C, 0x55,
0x3E, 0x39, 0xA6, 0xA5, 0xF7, 0xB8, 0xD0, 0x5E, 0xE2, 0xBF, 0x92, 0x44, 0xD9, 0xAA, 0x76, 0x22,
0xE3, 0x3E, 0x15, 0x96, 0xD8, 0x6A, 0x78, 0x2D, 0xA9, 0x77, 0x24, 0x1A, 0x4B, 0xE7, 0x5A, 0x2E,
0x89, 0x77, 0xAE, 0x92, 0xE4, 0xA4, 0x2D, 0xAF, 0x0B, 0x27, 0x09, 0xB2, 0x5F, 0x95, 0x61, 0xA9,
0xA8, 0xBE, 0x5D, 0x39, 0xBE, 0x41, 0x5F, 0x9C, 0x67, 0x28, 0x48, 0x4F, 0xAE, 0x2A, 0x50, 0x2B,
0xB8, 0xC7, 0x42, 0x73, 0x51, 0x60, 0x59, 0xD8, 0x9C, 0xBA, 0x22, 0x2F, 0x8E, 0x34, 0xDE, 0xC8,
0x1B, 0xAE, 0x9E, 0xBD, 0xF7, 0xE8, 0xFD, 0x8A, 0x97, 0xBE, 0xF0, 0x47, 0xAC, 0x27, 0xDD, 0x28,
0xC9, 0x28, 0xA8, 0x7B, 0x2A, 0xB8, 0x90, 0x3E, 0xCA, 0xB4, 0x78, 0x44, 0xCE, 0xCD, 0x91, 0xEC,
0xC2, 0x5A, 0x17, 0x59, 0x7C, 0x14, 0xF8, 0x95, 0x28, 0x14, 0xC3, 0xAD, 0xC4, 0xE1, 0x13, 0x5A,
0xC4, 0xA7, 0xC7, 0x77, 0xAD, 0xF8, 0x09, 0x61, 0x16, 0xBB, 0xAA, 0x7E, 0xAB, 0xC3, 0x00, 0x25,
0xBA, 0xA8, 0x97, 0xC7, 0x7D, 0x38, 0x46, 0x0E, 0x59, 0xAC, 0xCB, 0xAE, 0xFE, 0x3C, 0x6F, 0x01
};
#endif

View file

@@ -43,6 +43,11 @@
#include "crypto/cn/soft_aes.h"
#ifdef XMRIG_VAES
# include "crypto/cn/CryptoNight_x86_vaes.h"
#endif
extern "C"
{
#include "crypto/cn/c_groestl.h"
@@ -285,23 +290,48 @@ inline constexpr uint64_t interleaved_index<0>(uint64_t k)
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
{
constexpr CnAlgo<ALGO> props;
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes(ctx, props.memory(), props.half_mem());
return;
}
# endif
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m128i* input = reinterpret_cast<const __m128i*>(ctx->state);
__m128i* output = reinterpret_cast<__m128i*>(ctx->memory);
aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
if (props.half_mem() && !ctx->first_half) {
const __m128i* p = reinterpret_cast<const __m128i*>(ctx->save_state);
xin0 = _mm_load_si128(p + 0);
xin1 = _mm_load_si128(p + 1);
xin2 = _mm_load_si128(p + 2);
xin3 = _mm_load_si128(p + 3);
xin4 = _mm_load_si128(p + 4);
xin5 = _mm_load_si128(p + 5);
xin6 = _mm_load_si128(p + 6);
xin7 = _mm_load_si128(p + 7);
}
else {
xin0 = _mm_load_si128(input + 4);
xin1 = _mm_load_si128(input + 5);
xin2 = _mm_load_si128(input + 6);
xin3 = _mm_load_si128(input + 7);
xin4 = _mm_load_si128(input + 8);
xin5 = _mm_load_si128(input + 9);
xin6 = _mm_load_si128(input + 10);
xin7 = _mm_load_si128(input + 11);
}
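// With half_mem only half of the scratchpad rows exist at any time: the first
// pass seeds the AES pipeline from the Keccak state and saves it below, the
// second pass resumes from save_state so the regenerated second half matches
// a full-memory run.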
if (props.isHeavy()) {
for (size_t i = 0; i < 16; i++) {
@@ -320,50 +350,80 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
}
}
constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
constexpr int prefetch_dist = 2048 / sizeof(__m128i);
__m128i* e = output + (N << interleave) - prefetch_dist;
__m128i* prefetch_ptr = output + prefetch_dist;
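// Two-pass store loop: the first do/while stops prefetch_dist (2 KiB) short
// of the end so the look-ahead prefetches never touch memory past the
// scratchpad; the second pass finishes the tail with prefetches re-aimed at
// the lines it is about to write.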
for (int i = 0; i < 2; ++i) {
do {
_mm_prefetch((const char*)(prefetch_ptr), _MM_HINT_T0);
_mm_prefetch((const char*)(prefetch_ptr + output_increment), _MM_HINT_T0);
aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
_mm_store_si128(output + 0, xin0);
_mm_store_si128(output + 1, xin1);
_mm_store_si128(output + 2, xin2);
_mm_store_si128(output + 3, xin3);
_mm_store_si128(output + output_increment + 0, xin4);
_mm_store_si128(output + output_increment + 1, xin5);
_mm_store_si128(output + output_increment + 2, xin6);
_mm_store_si128(output + output_increment + 3, xin7);
output += output_increment * 2;
prefetch_ptr += output_increment * 2;
} while (output < e);
e += prefetch_dist;
prefetch_ptr = output;
}
if (props.half_mem() && ctx->first_half) {
__m128i* p = reinterpret_cast<__m128i*>(ctx->save_state);
_mm_store_si128(p + 0, xin0);
_mm_store_si128(p + 1, xin1);
_mm_store_si128(p + 2, xin2);
_mm_store_si128(p + 3, xin3);
_mm_store_si128(p + 4, xin4);
_mm_store_si128(p + 5, xin5);
_mm_store_si128(p + 6, xin6);
_mm_store_si128(p + 7, xin7);
}
}
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx)
{
constexpr CnAlgo<ALGO> props;
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes(ctx, props.memory(), props.half_mem());
return;
}
# endif
constexpr bool IS_HEAVY = props.isHeavy();
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m128i *input = reinterpret_cast<const __m128i*>(ctx->memory);
__m128i *output = reinterpret_cast<__m128i*>(ctx->state);
aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xout0 = _mm_load_si128(output + 4);
@@ -376,46 +436,54 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
xout7 = _mm_load_si128(output + 11);
const __m128i* input_begin = input;
for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
if (props.half_mem() && (part == 1)) {
input = input_begin;
ctx->first_half = false;
cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(ctx);
}
for (size_t i = 0; i < N;) {
xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
input += input_increment * 2;
i += 8;
if (i < N) {
_mm_prefetch((const char*)(input), _MM_HINT_T0);
_mm_prefetch((const char*)(input + input_increment), _MM_HINT_T0);
}
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
if (IS_HEAVY) {
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
}
}
}
if (IS_HEAVY) {
input = input_begin;
for (size_t i = 0; i < N;) {
xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
@@ -523,6 +591,9 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
void v4_soft_aes_compile_code(const V4_Instruction *code, int code_size, void *machine_code, xmrig::Assembly ASM);
alignas(64) static const uint32_t tweak1_table[256] = { 268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456 };
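// Same 256-entry VARIANT1 lookup as the NEON build: entry x precomputes
// ((0x7531 >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 3) << 28.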
namespace xmrig {
@@ -541,12 +612,7 @@ static inline void cryptonight_monero_tweak(uint64_t *mem_out, const uint8_t *l,
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = _mm_cvtsi128_si64(tmp);
mem_out[1] = vh ^ tweak1_table[static_cast<uint32_t>(vh) >> 24];
}
}
@@ -568,9 +634,35 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
}
#ifdef XMRIG_FEATURE_ASM
template<Algorithm::Id ALGO>
static void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height);
#endif
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
# ifdef XMRIG_FEATURE_ASM
if (!SOFT_AES) {
switch (ALGO) {
case Algorithm::CN_GR_0:
case Algorithm::CN_GR_1:
case Algorithm::CN_GR_2:
case Algorithm::CN_GR_3:
case Algorithm::CN_GR_4:
case Algorithm::CN_GR_5:
if (cn_sse41_enabled) {
cryptonight_single_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
return;
}
break;
default:
break;
}
}
# endif
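// GhostRider variants are routed to the hand-tuned SSE4.1 asm mainloops when
// the CPU supports them; the generic template below remains the soft-AES and
// fallback path.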
constexpr CnAlgo<ALGO> props;
constexpr size_t MASK = props.mask();
constexpr Algorithm::Id BASE = props.base();
@@ -587,7 +679,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
}
keccak(input, size, ctx[0]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
}
cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(ctx[0]);
uint64_t *h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
uint8_t *l0 = ctx[0]->memory;
@@ -712,7 +808,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
int64_t d5;
# if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 8)) || !defined(XMRIG_64_BIT)
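// 32-bit builds also take the plain path, presumably because the workaround
// below assumes 64-bit registers.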
d5 = d | 5;
# else
// Workaround for stupid GCC which converts to 32 bit before doing "| 5" and then converts back to 64 bit
@@ -742,7 +838,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
}
# endif
cn_implode_scratchpad<ALGO, SOFT_AES, interleave>(ctx[0]);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
@@ -752,13 +848,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
#ifdef XMRIG_FEATURE_ASM
extern "C" void cnv1_single_mainloop_asm(cryptonight_ctx * *ctx);
extern "C" void cnv1_double_mainloop_asm(cryptonight_ctx **ctx);
extern "C" void cnv1_quad_mainloop_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx);
extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx);
extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx **ctx);
namespace xmrig {
@@ -795,6 +894,28 @@ extern cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm;
extern cn_mainloop_fun cn_upx2_mainloop_asm;
extern cn_mainloop_fun cn_upx2_double_mainloop_asm;
extern cn_mainloop_fun cn_gr0_single_mainloop_asm;
extern cn_mainloop_fun cn_gr1_single_mainloop_asm;
extern cn_mainloop_fun cn_gr2_single_mainloop_asm;
extern cn_mainloop_fun cn_gr3_single_mainloop_asm;
extern cn_mainloop_fun cn_gr4_single_mainloop_asm;
extern cn_mainloop_fun cn_gr5_single_mainloop_asm;
extern cn_mainloop_fun cn_gr0_double_mainloop_asm;
extern cn_mainloop_fun cn_gr1_double_mainloop_asm;
extern cn_mainloop_fun cn_gr2_double_mainloop_asm;
extern cn_mainloop_fun cn_gr3_double_mainloop_asm;
extern cn_mainloop_fun cn_gr4_double_mainloop_asm;
extern cn_mainloop_fun cn_gr5_double_mainloop_asm;
extern cn_mainloop_fun cn_gr0_quad_mainloop_asm;
extern cn_mainloop_fun cn_gr1_quad_mainloop_asm;
extern cn_mainloop_fun cn_gr2_quad_mainloop_asm;
extern cn_mainloop_fun cn_gr3_quad_mainloop_asm;
extern cn_mainloop_fun cn_gr4_quad_mainloop_asm;
extern cn_mainloop_fun cn_gr5_quad_mainloop_asm;
} // namespace xmrig
@@ -833,7 +954,11 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
}
keccak(input, size, ctx[0]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
}
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
if (ALGO == Algorithm::CN_2) {
if (ASM == Assembly::INTEL) {
@@ -915,7 +1040,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
ctx[0]->generated_code(ctx);
}
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
@@ -937,8 +1062,21 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
keccak(input, size, ctx[0]->state);
keccak(input + size, size, ctx[1]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
ctx[1]->first_half = true;
}
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
}
if (ALGO == Algorithm::CN_2) {
cnv2_double_mainloop_sandybridge_asm(ctx);
@@ -977,8 +1115,16 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
ctx[0]->generated_code(ctx);
}
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
}
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
@@ -995,9 +1141,130 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
namespace xmrig {
#ifdef XMRIG_FEATURE_ASM
template<Algorithm::Id ALGO>
static NOINLINE void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
{
constexpr CnAlgo<ALGO> props;
constexpr Algorithm::Id BASE = props.base();
if (BASE == Algorithm::CN_1 && size < 43) {
memset(output, 0, 32);
return;
}
keccak(input, size, ctx[0]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
}
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
VARIANT1_INIT(0);
ctx[0]->tweak1_2 = tweak1_2_0;
ctx[0]->tweak1_table = tweak1_table;
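// The asm mainloop applies the VARIANT1 tweak itself, so it needs both the
// per-hash tweak1_2 value and the shared lookup table in the context.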
if (ALGO == Algorithm::CN_GR_0) cn_gr0_single_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_1) cn_gr1_single_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_2) cn_gr2_single_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_3) cn_gr3_single_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_4) cn_gr4_single_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_5) cn_gr5_single_mainloop_asm(ctx);
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
template<Algorithm::Id ALGO>
static NOINLINE void cryptonight_double_hash_gr_sse41(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
constexpr CnAlgo<ALGO> props;
constexpr Algorithm::Id BASE = props.base();
if (BASE == Algorithm::CN_1 && size < 43) {
memset(output, 0, 64);
return;
}
keccak(input, size, ctx[0]->state);
keccak(input + size, size, ctx[1]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
ctx[1]->first_half = true;
}
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
}
VARIANT1_INIT(0);
VARIANT1_INIT(1);
ctx[0]->tweak1_2 = tweak1_2_0;
ctx[1]->tweak1_2 = tweak1_2_1;
ctx[0]->tweak1_table = tweak1_table;
if (ALGO == Algorithm::CN_GR_0) cn_gr0_double_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_1) cn_gr1_double_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_2) cn_gr2_double_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_3) cn_gr3_double_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_4) cn_gr4_double_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_5) cn_gr5_double_mainloop_asm(ctx);
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
}
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
#endif
template<Algorithm::Id ALGO, bool SOFT_AES>
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
# ifdef XMRIG_FEATURE_ASM
if (!SOFT_AES) {
switch (ALGO) {
case Algorithm::CN_GR_0:
case Algorithm::CN_GR_1:
case Algorithm::CN_GR_2:
case Algorithm::CN_GR_3:
case Algorithm::CN_GR_4:
case Algorithm::CN_GR_5:
if (cn_sse41_enabled) {
cryptonight_double_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
return;
}
break;
default:
break;
}
}
# endif
constexpr CnAlgo<ALGO> props;
constexpr size_t MASK = props.mask();
constexpr Algorithm::Id BASE = props.base();
@@ -1029,8 +1296,21 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
VARIANT4_RANDOM_MATH_INIT(0);
VARIANT4_RANDOM_MATH_INIT(1);
if (props.half_mem()) {
ctx[0]->first_half = true;
ctx[1]->first_half = true;
}
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
}
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
@@ -1225,8 +1505,16 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
bx10 = cx1;
}
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
}
else
# endif
{
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
}
keccakf(h0, 24);
keccakf(h1, 24);
@@ -1236,6 +1524,85 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
}
#ifdef XMRIG_FEATURE_ASM
template<Algorithm::Id ALGO>
static NOINLINE void cryptonight_quad_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
{
constexpr CnAlgo<ALGO> props;
constexpr Algorithm::Id BASE = props.base();
if (BASE == Algorithm::CN_1 && size < 43) {
memset(output, 0, 32 * 4);
return;
}
keccak(input + size * 0, size, ctx[0]->state);
keccak(input + size * 1, size, ctx[1]->state);
keccak(input + size * 2, size, ctx[2]->state);
keccak(input + size * 3, size, ctx[3]->state);
if (props.half_mem()) {
ctx[0]->first_half = true;
ctx[1]->first_half = true;
ctx[2]->first_half = true;
ctx[3]->first_half = true;
}
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
}
else
# endif
{
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
cn_explode_scratchpad<ALGO, false, 0>(ctx[2]);
cn_explode_scratchpad<ALGO, false, 0>(ctx[3]);
}
VARIANT1_INIT(0); ctx[0]->tweak1_2 = tweak1_2_0;
VARIANT1_INIT(1); ctx[1]->tweak1_2 = tweak1_2_1;
VARIANT1_INIT(2); ctx[2]->tweak1_2 = tweak1_2_2;
VARIANT1_INIT(3); ctx[3]->tweak1_2 = tweak1_2_3;
ctx[0]->tweak1_table = tweak1_table;
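// Only ctx[0] stores the table pointer; the quad mainloop presumably reads it
// once from the first context for all four lanes, as the double path does.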
if (ALGO == Algorithm::CN_GR_0) cn_gr0_quad_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_1) cn_gr1_quad_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_2) cn_gr2_quad_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_3) cn_gr3_quad_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_4) cn_gr4_quad_mainloop_asm(ctx);
if (ALGO == Algorithm::CN_GR_5) cn_gr5_quad_mainloop_asm(ctx);
# ifdef XMRIG_VAES
if (!props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
}
else
# endif
{
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
cn_implode_scratchpad<ALGO, false, 0>(ctx[2]);
cn_implode_scratchpad<ALGO, false, 0>(ctx[3]);
}
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
keccakf(reinterpret_cast<uint64_t*>(ctx[2]->state), 24);
keccakf(reinterpret_cast<uint64_t*>(ctx[3]->state), 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
extra_hashes[ctx[2]->state[0] & 3](ctx[2]->state, 200, output + 64);
extra_hashes[ctx[3]->state[0] & 3](ctx[3]->state, 200, output + 96);
}
#endif
#define CN_STEP1(a, b0, b1, c, l, ptr, idx, conc_var) \
ptr = reinterpret_cast<__m128i*>(&l[idx & MASK]); \
c = _mm_load_si128(ptr); \
@@ -1371,7 +1738,10 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
for (size_t i = 0; i < 3; i++) {
keccak(input + size * i, size, ctx[i]->state);
if (props.half_mem()) {
ctx[i]->first_half = true;
}
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
}
uint8_t* l0 = ctx[0]->memory;
@@ -1416,7 +1786,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
}
for (size_t i = 0; i < 3; i++) {
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
}
@@ -1426,6 +1796,27 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
template<Algorithm::Id ALGO, bool SOFT_AES>
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
{
# ifdef XMRIG_FEATURE_ASM
if (!SOFT_AES) {
switch (ALGO) {
case Algorithm::CN_GR_0:
case Algorithm::CN_GR_1:
case Algorithm::CN_GR_2:
case Algorithm::CN_GR_3:
case Algorithm::CN_GR_4:
case Algorithm::CN_GR_5:
if (cn_sse41_enabled) {
cryptonight_quad_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
return;
}
break;
default:
break;
}
}
# endif
constexpr CnAlgo<ALGO> props;
constexpr size_t MASK = props.mask();
constexpr Algorithm::Id BASE = props.base();
@@ -1445,7 +1836,23 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
for (size_t i = 0; i < 4; i++) {
keccak(input + size * i, size, ctx[i]->state);
if (props.half_mem()) {
ctx[i]->first_half = true;
}
}
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
}
else
# endif
{
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
}
uint8_t* l0 = ctx[0]->memory;
@@ -1497,8 +1904,21 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
CN_STEP4(3, ax3, bx30, bx31, cx3, l3, mc3, ptr3, idx3);
}
# ifdef XMRIG_VAES
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
}
else
# endif
{
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
}
for (size_t i = 0; i < 4; i++) {
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
}
@@ -1527,7 +1947,10 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
for (size_t i = 0; i < 5; i++) {
keccak(input + size * i, size, ctx[i]->state);
if (props.half_mem()) {
ctx[i]->first_half = true;
}
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
}
uint8_t* l0 = ctx[0]->memory;
@@ -1588,7 +2011,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
}
for (size_t i = 0; i < 5; i++) {
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
}

View file

@@ -0,0 +1,478 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "CryptoNight_x86_vaes.h"
#include "CryptoNight_monero.h"
#include "CryptoNight.h"
#ifdef __GNUC__
# include <x86intrin.h>
#if !defined(__clang__) && !defined(__ICC) && __GNUC__ < 10
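// _mm256_loadu2_m128i/_mm256_storeu2_m128i only appeared in GCC 10, so
// provide minimal shims for older GCC versions.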
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr)
{
return _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}
static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
{
_mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
_mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif
#else
# include <intrin.h>
#endif
// This will shift and xor tmp1 into itself as 4 32-bit vals such as
// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
static FORCEINLINE __m128i sl_xor(__m128i tmp1)
{
__m128i tmp4;
tmp4 = _mm_slli_si128(tmp1, 0x04);
tmp1 = _mm_xor_si128(tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
tmp1 = _mm_xor_si128(tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
tmp1 = _mm_xor_si128(tmp1, tmp4);
return tmp1;
}
template<uint8_t rcon>
static FORCEINLINE void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
{
__m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
*xout0 = sl_xor(*xout0);
*xout0 = _mm_xor_si128(*xout0, xout1);
xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00);
xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
*xout2 = sl_xor(*xout2);
*xout2 = _mm_xor_si128(*xout2, xout1);
}
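// One step of the standard AES-256 key expansion: AESKEYGENASSIST supplies
// SubWord/RotWord (+rcon), sl_xor accumulates the sliding XOR of the
// previous key words.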
static NOINLINE void vaes_genkey(const __m128i* memory, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
{
__m128i xout0 = _mm_load_si128(memory);
__m128i xout2 = _mm_load_si128(memory + 1);
*k0 = _mm256_set_m128i(xout0, xout0);
*k1 = _mm256_set_m128i(xout2, xout2);
aes_genkey_sub<0x01>(&xout0, &xout2);
*k2 = _mm256_set_m128i(xout0, xout0);
*k3 = _mm256_set_m128i(xout2, xout2);
aes_genkey_sub<0x02>(&xout0, &xout2);
*k4 = _mm256_set_m128i(xout0, xout0);
*k5 = _mm256_set_m128i(xout2, xout2);
aes_genkey_sub<0x04>(&xout0, &xout2);
*k6 = _mm256_set_m128i(xout0, xout0);
*k7 = _mm256_set_m128i(xout2, xout2);
aes_genkey_sub<0x08>(&xout0, &xout2);
*k8 = _mm256_set_m128i(xout0, xout0);
*k9 = _mm256_set_m128i(xout2, xout2);
}
static NOINLINE void vaes_genkey_double(const __m128i* memory1, const __m128i* memory2, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
{
__m128i xout0 = _mm_load_si128(memory1);
__m128i xout1 = _mm_load_si128(memory1 + 1);
__m128i xout2 = _mm_load_si128(memory2);
__m128i xout3 = _mm_load_si128(memory2 + 1);
*k0 = _mm256_set_m128i(xout2, xout0);
*k1 = _mm256_set_m128i(xout3, xout1);
aes_genkey_sub<0x01>(&xout0, &xout1);
aes_genkey_sub<0x01>(&xout2, &xout3);
*k2 = _mm256_set_m128i(xout2, xout0);
*k3 = _mm256_set_m128i(xout3, xout1);
aes_genkey_sub<0x02>(&xout0, &xout1);
aes_genkey_sub<0x02>(&xout2, &xout3);
*k4 = _mm256_set_m128i(xout2, xout0);
*k5 = _mm256_set_m128i(xout3, xout1);
aes_genkey_sub<0x04>(&xout0, &xout1);
aes_genkey_sub<0x04>(&xout2, &xout3);
*k6 = _mm256_set_m128i(xout2, xout0);
*k7 = _mm256_set_m128i(xout3, xout1);
aes_genkey_sub<0x08>(&xout0, &xout1);
aes_genkey_sub<0x08>(&xout2, &xout3);
*k8 = _mm256_set_m128i(xout2, xout0);
*k9 = _mm256_set_m128i(xout3, xout1);
}
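// The _double variant packs round keys for two independent hashes into the
// low and high 128-bit lanes of each YMM register, so every VAES round below
// advances both scratchpads at once.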
static FORCEINLINE void vaes_round(__m256i key, __m256i& x01, __m256i& x23, __m256i& x45, __m256i& x67)
{
x01 = _mm256_aesenc_epi128(x01, key);
x23 = _mm256_aesenc_epi128(x23, key);
x45 = _mm256_aesenc_epi128(x45, key);
x67 = _mm256_aesenc_epi128(x67, key);
}
static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256i& x2, __m256i& x3, __m256i& x4, __m256i& x5, __m256i& x6, __m256i& x7)
{
x0 = _mm256_aesenc_epi128(x0, key);
x1 = _mm256_aesenc_epi128(x1, key);
x2 = _mm256_aesenc_epi128(x2, key);
x3 = _mm256_aesenc_epi128(x3, key);
x4 = _mm256_aesenc_epi128(x4, key);
x5 = _mm256_aesenc_epi128(x5, key);
x6 = _mm256_aesenc_epi128(x6, key);
x7 = _mm256_aesenc_epi128(x7, key);
}
namespace xmrig {
NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem)
{
const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1);
__m256i xin01, xin23, xin45, xin67;
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m128i* input = reinterpret_cast<const __m128i*>(ctx->state);
__m256i* output = reinterpret_cast<__m256i*>(ctx->memory);
vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
if (half_mem && !ctx->first_half) {
const __m256i* p = reinterpret_cast<const __m256i*>(ctx->save_state);
xin01 = _mm256_loadu_si256(p + 0);
xin23 = _mm256_loadu_si256(p + 1);
xin45 = _mm256_loadu_si256(p + 2);
xin67 = _mm256_loadu_si256(p + 3);
}
else {
xin01 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 4));
xin23 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 6));
xin45 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 8));
xin67 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + 10));
}
constexpr int output_increment = 64 / sizeof(__m256i);
constexpr int prefetch_dist = 2048 / sizeof(__m256i);
__m256i* e = output + N - prefetch_dist;
__m256i* prefetch_ptr = output + prefetch_dist;
for (int i = 0; i < 2; ++i) {
do {
_mm_prefetch((const char*)(prefetch_ptr), _MM_HINT_T0);
_mm_prefetch((const char*)(prefetch_ptr + output_increment), _MM_HINT_T0);
vaes_round(k0, xin01, xin23, xin45, xin67);
vaes_round(k1, xin01, xin23, xin45, xin67);
vaes_round(k2, xin01, xin23, xin45, xin67);
vaes_round(k3, xin01, xin23, xin45, xin67);
vaes_round(k4, xin01, xin23, xin45, xin67);
vaes_round(k5, xin01, xin23, xin45, xin67);
vaes_round(k6, xin01, xin23, xin45, xin67);
vaes_round(k7, xin01, xin23, xin45, xin67);
vaes_round(k8, xin01, xin23, xin45, xin67);
vaes_round(k9, xin01, xin23, xin45, xin67);
_mm256_store_si256(output + 0, xin01);
_mm256_store_si256(output + 1, xin23);
_mm256_store_si256(output + output_increment + 0, xin45);
_mm256_store_si256(output + output_increment + 1, xin67);
output += output_increment * 2;
prefetch_ptr += output_increment * 2;
} while (output < e);
e += prefetch_dist;
prefetch_ptr = output;
}
// First half done: save the running state so the second half can resume later.
if (half_mem && ctx->first_half) {
__m256i* p = reinterpret_cast<__m256i*>(ctx->save_state);
_mm256_storeu_si256(p + 0, xin01);
_mm256_storeu_si256(p + 1, xin23);
_mm256_storeu_si256(p + 2, xin45);
_mm256_storeu_si256(p + 3, xin67);
}
_mm256_zeroupper();
}
NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem)
{
const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1);
__m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->state);
const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->state);
__m128i* output1 = reinterpret_cast<__m128i*>(ctx1->memory);
__m128i* output2 = reinterpret_cast<__m128i*>(ctx2->memory);
vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
{
// Resume both contexts from their saved states when generating the second half.
const bool b = half_mem && !ctx1->first_half && !ctx2->first_half;
const __m128i* p1 = b ? reinterpret_cast<const __m128i*>(ctx1->save_state) : (input1 + 4);
const __m128i* p2 = b ? reinterpret_cast<const __m128i*>(ctx2->save_state) : (input2 + 4);
xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0);
xin1 = _mm256_loadu2_m128i(p2 + 1, p1 + 1);
xin2 = _mm256_loadu2_m128i(p2 + 2, p1 + 2);
xin3 = _mm256_loadu2_m128i(p2 + 3, p1 + 3);
xin4 = _mm256_loadu2_m128i(p2 + 4, p1 + 4);
xin5 = _mm256_loadu2_m128i(p2 + 5, p1 + 5);
xin6 = _mm256_loadu2_m128i(p2 + 6, p1 + 6);
xin7 = _mm256_loadu2_m128i(p2 + 7, p1 + 7);
}
constexpr int output_increment = 64 / sizeof(__m128i);
constexpr int prefetch_dist = 2048 / sizeof(__m128i);
__m128i* e = output1 + N - prefetch_dist;
__m128i* prefetch_ptr1 = output1 + prefetch_dist;
__m128i* prefetch_ptr2 = output2 + prefetch_dist;
for (int i = 0; i < 2; ++i) {
do {
_mm_prefetch((const char*)(prefetch_ptr1), _MM_HINT_T0);
_mm_prefetch((const char*)(prefetch_ptr1 + output_increment), _MM_HINT_T0);
_mm_prefetch((const char*)(prefetch_ptr2), _MM_HINT_T0);
_mm_prefetch((const char*)(prefetch_ptr2 + output_increment), _MM_HINT_T0);
vaes_round(k0, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k1, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k2, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k3, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k4, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k5, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k6, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k7, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k8, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
vaes_round(k9, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
_mm256_storeu2_m128i(output2 + 0, output1 + 0, xin0);
_mm256_storeu2_m128i(output2 + 1, output1 + 1, xin1);
_mm256_storeu2_m128i(output2 + 2, output1 + 2, xin2);
_mm256_storeu2_m128i(output2 + 3, output1 + 3, xin3);
_mm256_storeu2_m128i(output2 + output_increment + 0, output1 + output_increment + 0, xin4);
_mm256_storeu2_m128i(output2 + output_increment + 1, output1 + output_increment + 1, xin5);
_mm256_storeu2_m128i(output2 + output_increment + 2, output1 + output_increment + 2, xin6);
_mm256_storeu2_m128i(output2 + output_increment + 3, output1 + output_increment + 3, xin7);
output1 += output_increment * 2;
prefetch_ptr1 += output_increment * 2;
output2 += output_increment * 2;
prefetch_ptr2 += output_increment * 2;
} while (output1 < e);
e += prefetch_dist;
prefetch_ptr1 = output1;
prefetch_ptr2 = output2;
}
if (half_mem && ctx1->first_half && ctx2->first_half) {
__m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state);
__m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state);
_mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0);
_mm256_storeu2_m128i(p2 + 1, p1 + 1, xin1);
_mm256_storeu2_m128i(p2 + 2, p1 + 2, xin2);
_mm256_storeu2_m128i(p2 + 3, p1 + 3, xin3);
_mm256_storeu2_m128i(p2 + 4, p1 + 4, xin4);
_mm256_storeu2_m128i(p2 + 5, p1 + 5, xin5);
_mm256_storeu2_m128i(p2 + 6, p1 + 6, xin6);
_mm256_storeu2_m128i(p2 + 7, p1 + 7, xin7);
}
_mm256_zeroupper();
}
NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem)
{
const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1);
__m256i xout01, xout23, xout45, xout67;
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m256i* input = reinterpret_cast<const __m256i*>(ctx->memory);
__m256i* output = reinterpret_cast<__m256i*>(ctx->state);
vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xout01 = _mm256_loadu_si256(output + 2);
xout23 = _mm256_loadu_si256(output + 3);
xout45 = _mm256_loadu_si256(output + 4);
xout67 = _mm256_loadu_si256(output + 5);
const __m256i* input_begin = input;
for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) {
// With half_mem the scratchpad exists in halves: regenerate the second
// half in place before folding it into the state.
if (half_mem && (part == 1)) {
input = input_begin;
ctx->first_half = false;
cn_explode_scratchpad_vaes(ctx, memory, half_mem);
}
for (size_t i = 0; i < N;) {
xout01 = _mm256_xor_si256(xout01, input[0]);
xout23 = _mm256_xor_si256(xout23, input[1]);
constexpr int input_increment = 64 / sizeof(__m256i);
xout45 = _mm256_xor_si256(xout45, input[input_increment]);
xout67 = _mm256_xor_si256(xout67, input[input_increment + 1]);
input += input_increment * 2;
i += 4;
if (i < N) {
_mm_prefetch((const char*)(input), _MM_HINT_T0);
_mm_prefetch((const char*)(input + input_increment), _MM_HINT_T0);
}
vaes_round(k0, xout01, xout23, xout45, xout67);
vaes_round(k1, xout01, xout23, xout45, xout67);
vaes_round(k2, xout01, xout23, xout45, xout67);
vaes_round(k3, xout01, xout23, xout45, xout67);
vaes_round(k4, xout01, xout23, xout45, xout67);
vaes_round(k5, xout01, xout23, xout45, xout67);
vaes_round(k6, xout01, xout23, xout45, xout67);
vaes_round(k7, xout01, xout23, xout45, xout67);
vaes_round(k8, xout01, xout23, xout45, xout67);
vaes_round(k9, xout01, xout23, xout45, xout67);
}
}
_mm256_storeu_si256(output + 2, xout01);
_mm256_storeu_si256(output + 3, xout23);
_mm256_storeu_si256(output + 4, xout45);
_mm256_storeu_si256(output + 5, xout67);
_mm256_zeroupper();
}
NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem)
{
const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1);
__m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->memory);
const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->memory);
__m128i* output1 = reinterpret_cast<__m128i*>(ctx1->state);
__m128i* output2 = reinterpret_cast<__m128i*>(ctx2->state);
vaes_genkey_double(output1 + 2, output2 + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xout0 = _mm256_loadu2_m128i(output2 + 4, output1 + 4);
xout1 = _mm256_loadu2_m128i(output2 + 5, output1 + 5);
xout2 = _mm256_loadu2_m128i(output2 + 6, output1 + 6);
xout3 = _mm256_loadu2_m128i(output2 + 7, output1 + 7);
xout4 = _mm256_loadu2_m128i(output2 + 8, output1 + 8);
xout5 = _mm256_loadu2_m128i(output2 + 9, output1 + 9);
xout6 = _mm256_loadu2_m128i(output2 + 10, output1 + 10);
xout7 = _mm256_loadu2_m128i(output2 + 11, output1 + 11);
const __m128i* input_begin1 = input1;
const __m128i* input_begin2 = input2;
for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) {
// Regenerate the second halves of both scratchpads before consuming them.
if (half_mem && (part == 1)) {
input1 = input_begin1;
input2 = input_begin2;
ctx1->first_half = false;
ctx2->first_half = false;
cn_explode_scratchpad_vaes_double(ctx1, ctx2, memory, half_mem);
}
for (size_t i = 0; i < N;) {
xout0 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 0, input1 + 0), xout0);
xout1 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 1, input1 + 1), xout1);
xout2 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 2, input1 + 2), xout2);
xout3 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 3, input1 + 3), xout3);
constexpr int input_increment = 64 / sizeof(__m128i);
xout4 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 0, input1 + input_increment + 0), xout4);
xout5 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 1, input1 + input_increment + 1), xout5);
xout6 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 2, input1 + input_increment + 2), xout6);
xout7 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 3, input1 + input_increment + 3), xout7);
input1 += input_increment * 2;
input2 += input_increment * 2;
i += 8;
if (i < N) {
_mm_prefetch((const char*)(input1), _MM_HINT_T0);
_mm_prefetch((const char*)(input1 + input_increment), _MM_HINT_T0);
_mm_prefetch((const char*)(input2), _MM_HINT_T0);
_mm_prefetch((const char*)(input2 + input_increment), _MM_HINT_T0);
}
vaes_round(k0, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k1, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k2, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k3, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k4, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k5, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k6, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k7, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k8, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
vaes_round(k9, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
}
}
_mm256_storeu2_m128i(output2 + 4, output1 + 4, xout0);
_mm256_storeu2_m128i(output2 + 5, output1 + 5, xout1);
_mm256_storeu2_m128i(output2 + 6, output1 + 6, xout2);
_mm256_storeu2_m128i(output2 + 7, output1 + 7, xout3);
_mm256_storeu2_m128i(output2 + 8, output1 + 8, xout4);
_mm256_storeu2_m128i(output2 + 9, output1 + 9, xout5);
_mm256_storeu2_m128i(output2 + 10, output1 + 10, xout6);
_mm256_storeu2_m128i(output2 + 11, output1 + 11, xout7);
_mm256_zeroupper();
}
} // xmrig
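// A standalone sketch (not part of the file above) of the invariant the
// vaes_round helpers rely on: _mm256_aesenc_epi128 runs one independent AES
// round per 128-bit lane, so it should match two _mm_aesenc_si128 calls on
// the halves. Hypothetical test program; build with AES-NI, AVX2 and VAES
// enabled (e.g. -maes -mavx2 -mvaes).
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    alignas(32) uint8_t in[32], key[32], wide[32], narrow[32];
    for (int i = 0; i < 32; ++i) { in[i] = uint8_t(i * 7 + 1); key[i] = uint8_t(i * 13 + 5); }

    const __m256i x = _mm256_load_si256(reinterpret_cast<const __m256i*>(in));
    const __m256i k = _mm256_load_si256(reinterpret_cast<const __m256i*>(key));

    // VAES: both 128-bit lanes in one instruction.
    _mm256_store_si256(reinterpret_cast<__m256i*>(wide), _mm256_aesenc_epi128(x, k));

    // Reference: one AES-NI round per lane.
    _mm_store_si128(reinterpret_cast<__m128i*>(narrow),
                    _mm_aesenc_si128(_mm256_castsi256_si128(x), _mm256_castsi256_si128(k)));
    _mm_store_si128(reinterpret_cast<__m128i*>(narrow + 16),
                    _mm_aesenc_si128(_mm256_extracti128_si256(x, 1), _mm256_extracti128_si256(k, 1)));

    std::printf("%s\n", std::memcmp(wide, narrow, 32) == 0 ? "lanes match" : "mismatch");
    return 0;
}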

View file

@ -0,0 +1,48 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XMRIG_CRYPTONIGHT_X86_VAES_H
#define XMRIG_CRYPTONIGHT_X86_VAES_H
#include "crypto/cn/CnAlgo.h"
struct cryptonight_ctx;
namespace xmrig {
void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem);
void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem);
void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem);
void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem);
} // xmrig
#endif /* XMRIG_CRYPTONIGHT_X86_VAES_H */
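// The *_double entry points above process two contexts at once by packing one
// 128-bit block from each context into a single 256-bit register. A minimal
// sketch (hypothetical buffers, not part of the header) of the
// _mm256_loadu2_m128i / _mm256_storeu2_m128i gather/scatter round trip they
// are built on:
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    uint8_t a[16], b[16], a2[16], b2[16];
    for (int i = 0; i < 16; ++i) { a[i] = uint8_t(i); b[i] = uint8_t(0xF0 + i); }

    // Gather: low lane <- ctx1 block (a), high lane <- ctx2 block (b).
    const __m256i v = _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(b),
                                          reinterpret_cast<const __m128i*>(a));

    // Scatter the two lanes back to two unrelated destinations.
    _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(b2), reinterpret_cast<__m128i*>(a2), v);

    std::printf("%02x %02x\n", a2[0], b2[0]); // expect: 00 f0
    return 0;
}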

View file

@ -0,0 +1,132 @@
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 32
mov rdx, QWORD PTR [rcx]
mov r8, QWORD PTR [rcx+8]
mov r12d, 524288
movaps XMMWORD PTR [rsp+16], xmm6
mov rbx, QWORD PTR [rdx+32]
xor rbx, QWORD PTR [rdx]
mov rsi, QWORD PTR [rdx+40]
mov r10, rbx
xor rsi, QWORD PTR [rdx+8]
and r10d, 2097136
mov rdi, QWORD PTR [r8+32]
xor rdi, QWORD PTR [r8]
movq xmm3, rbx
mov rbp, QWORD PTR [r8+40]
mov r9, rdi
xor rbp, QWORD PTR [r8+8]
movq xmm0, rsi
mov rcx, QWORD PTR [rdx+56]
and r9d, 2097136
xor rcx, QWORD PTR [rdx+24]
movq xmm4, rdi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov r14, QWORD PTR [rdx+224]
mov r13, QWORD PTR [rdx+232]
mov r15, QWORD PTR [r8+224]
punpcklqdq xmm3, xmm0
movq xmm0, rbp
movq xmm5, rax
punpcklqdq xmm4, xmm0
mov rax, QWORD PTR [r8+48]
movq xmm0, rcx
xor rax, QWORD PTR [r8+16]
mov rcx, QWORD PTR [r8+56]
xor rcx, QWORD PTR [r8+24]
movdqu xmm1, XMMWORD PTR [r14+r10]
movq xmm6, rax
punpcklqdq xmm5, xmm0
mov rax, QWORD PTR [rdx+240]
movq xmm0, rcx
movdqu xmm2, XMMWORD PTR [r15+r9]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r8+240]
mov QWORD PTR [rsp+8], rax
punpcklqdq xmm6, xmm0
ALIGN(64)
main_loop_cnv1_double:
aesenc xmm1, xmm3
aesenc xmm2, xmm4
movdqa xmm0, xmm1
movq r11, xmm2
pxor xmm0, xmm5
movdqa xmm5, xmm1
movq QWORD PTR [r14+r10], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
movdqa xmm0, xmm2
shr rax, 24
pxor xmm0, xmm6
movdqa xmm6, xmm2
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
mov QWORD PTR [r14+r10+8], rax
movq QWORD PTR [r15+r9], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
shr rax, 24
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
movq rcx, xmm1
mov QWORD PTR [r15+r9+8], rax
mov r9, rcx
and r9d, 2097136
mov r10, QWORD PTR [r14+r9]
mov r8, QWORD PTR [r14+r9+8]
mov rax, r10
mul rcx
add rsi, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [r14+r9], rbx
xor rax, rsi
mov QWORD PTR [r14+r9+8], rax
xor rsi, r8
xor rbx, r10
mov r8, r11
and r8d, 2097136
mov r10, rbx
and r10d, 2097136
movq xmm3, rbx
pinsrq xmm3, rsi, 1
mov r9, QWORD PTR [r15+r8]
mov rcx, QWORD PTR [r15+r8+8]
mov rax, r9
movdqu xmm1, XMMWORD PTR [r14+r10]
mul r11
add rbp, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r15+r8], rdi
xor rax, rbp
xor rdi, r9
mov QWORD PTR [r15+r8+8], rax
mov r9, rdi
xor rbp, rcx
and r9d, 2097136
movq xmm4, rdi
pinsrq xmm4, rbp, 1
movdqu xmm2, XMMWORD PTR [r15+r9]
sub r12, 1
jne main_loop_cnv1_double
mov rbx, QWORD PTR [rsp+80]
mov rbp, QWORD PTR [rsp+88]
mov rsi, QWORD PTR [rsp+96]
movaps xmm6, XMMWORD PTR [rsp+16]
add rsp, 32
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,263 @@
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+16], rbp
mov QWORD PTR [rax+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 144
mov r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+8]
mov r10, QWORD PTR [rcx+16]
mov r11, QWORD PTR [rcx+24]
mov rbp, QWORD PTR [r8+224]
mov r13, QWORD PTR [r8+232]
mov r14, QWORD PTR [r9+224]
mov r15, QWORD PTR [r10+224]
mov r12, QWORD PTR [r11+224]
mov rcx, QWORD PTR [r8+40]
xor rcx, QWORD PTR [r8+8]
mov rbx, QWORD PTR [r8+32]
xor rbx, QWORD PTR [r8]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
movq xmm0, rcx
mov rcx, QWORD PTR [r9+40]
xor rcx, QWORD PTR [r9+8]
movq xmm1, rbx
movaps XMMWORD PTR [rax-56], xmm6
movaps XMMWORD PTR [rax-72], xmm7
movaps XMMWORD PTR [rax-88], xmm8
movaps XMMWORD PTR [rax-104], xmm9
movaps XMMWORD PTR [rax-120], xmm10
movaps XMMWORD PTR [rsp+48], xmm11
movaps XMMWORD PTR [rsp+32], xmm12
and ebx, 2097136
mov rsi, QWORD PTR [r10+32]
movq xmm2, rdi
mov rax, QWORD PTR [r8+240]
and edi, 2097136
xor rsi, QWORD PTR [r10]
mov rdx, QWORD PTR [r8+56]
xor rdx, QWORD PTR [r8+24]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r9+240]
movq xmm3, rsi
mov QWORD PTR [rsp+8], rax
and esi, 2097136
mov rax, QWORD PTR [r10+240]
punpcklqdq xmm1, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r10+40]
xor rcx, QWORD PTR [r10+8]
mov QWORD PTR [rsp+16], rax
mov rax, QWORD PTR [r11+240]
punpcklqdq xmm2, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+24], rax
mov rcx, QWORD PTR [r11+40]
xor rcx, QWORD PTR [r11+8]
mov rax, QWORD PTR [r11+32]
xor rax, QWORD PTR [r11]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r8+48]
xor rcx, QWORD PTR [r8+16]
movq xmm4, rax
and eax, 2097136
punpcklqdq xmm4, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r9+56]
xor rdx, QWORD PTR [r9+24]
movq xmm5, rcx
mov rcx, QWORD PTR [r9+48]
xor rcx, QWORD PTR [r9+16]
punpcklqdq xmm5, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r10+56]
xor rdx, QWORD PTR [r10+24]
movq xmm6, rcx
mov rcx, QWORD PTR [r10+48]
xor rcx, QWORD PTR [r10+16]
punpcklqdq xmm6, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r11+56]
movq xmm7, rcx
punpcklqdq xmm7, xmm0
xor rdx, QWORD PTR [r11+24]
mov rcx, QWORD PTR [r11+48]
xor rcx, QWORD PTR [r11+16]
mov r11d, 524288
movdqu xmm9, XMMWORD PTR [rbp+rbx]
movdqu xmm10, XMMWORD PTR [r14+rdi]
movq xmm0, rdx
movdqu xmm11, XMMWORD PTR [r15+rsi]
movdqu xmm12, XMMWORD PTR [r12+rax]
movq xmm8, rcx
punpcklqdq xmm8, xmm0
ALIGN(64)
main_loop_cnv1_quad:
aesenc xmm9, xmm1
aesenc xmm10, xmm2
aesenc xmm11, xmm3
aesenc xmm12, xmm4
movd ecx, xmm9
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+rbp]
movd ecx, xmm10
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r14]
movd ecx, xmm11
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r15]
movd ecx, xmm12
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r12]
movdqa xmm0, xmm9
pxor xmm0, xmm5
movdqa xmm5, xmm9
movq QWORD PTR [rbp+rbx], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm10
shr rcx, 24
pxor xmm0, xmm6
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [rbp+rbx+8], rcx
movq rbx, xmm1
movq QWORD PTR [r14+rdi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm11
shr rcx, 24
pxor xmm0, xmm7
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r14+rdi+8], rcx
movq rdi, xmm2
movq QWORD PTR [r15+rsi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm12
shr rcx, 24
pxor xmm0, xmm8
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r15+rsi+8], rcx
movq QWORD PTR [r12+rax], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
shr rcx, 24
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r12+rax+8], rcx
movq rcx, xmm9
mov r8, rcx
and r8d, 2097136
mov r9, QWORD PTR [rbp+r8]
mov r10, QWORD PTR [rbp+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm1, 1
add rcx, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [rbp+r8], rbx
xor rax, rcx
mov QWORD PTR [rbp+r8+8], rax
xor rcx, r10
xor rbx, r9
movq xmm1, rbx
and ebx, 2097136
pinsrq xmm1, rcx, 1
movq rcx, xmm10
mov r8, rcx
and r8d, 2097136
movdqu xmm9, XMMWORD PTR [rbp+rbx]
mov r9, QWORD PTR [r14+r8]
mov r10, QWORD PTR [r14+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm2, 1
add rcx, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r14+r8], rdi
xor rax, rcx
xor rdi, r9
mov QWORD PTR [r14+r8+8], rax
xor rcx, r10
movq xmm2, rdi
and edi, 2097136
pinsrq xmm2, rcx, 1
movq rcx, xmm11
movq rsi, xmm3
mov r8, rcx
and r8d, 2097136
movdqa xmm6, xmm10
movdqa xmm7, xmm11
movdqa xmm8, xmm12
movdqu xmm10, XMMWORD PTR [r14+rdi]
mov r9, QWORD PTR [r15+r8]
mov r10, QWORD PTR [r15+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm3, 1
add rcx, rax
add rsi, rdx
mov rax, QWORD PTR [rsp+16]
xor rax, rcx
mov QWORD PTR [r15+r8], rsi
mov QWORD PTR [r15+r8+8], rax
xor rcx, r10
xor rsi, r9
movq xmm3, rsi
and esi, 2097136
pinsrq xmm3, rcx, 1
movq rcx, xmm12
mov r8, rcx
and r8d, 2097136
movdqu xmm11, XMMWORD PTR [r15+rsi]
mov r9, QWORD PTR [r12+r8]
mov r10, QWORD PTR [r12+r8+8]
mov rax, r9
mul rcx
mov rcx, rax
movq rax, xmm4
add rax, rdx
mov QWORD PTR [r12+r8], rax
xor rax, r9
pextrq rdx, xmm4, 1
add rdx, rcx
mov rcx, QWORD PTR [rsp+24]
xor rcx, rdx
xor rdx, r10
movq xmm4, rax
mov QWORD PTR [r12+r8+8], rcx
and eax, 2097136
pinsrq xmm4, rdx, 1
movdqu xmm12, XMMWORD PTR [r12+rax]
sub r11, 1
jne main_loop_cnv1_quad
movaps xmm7, XMMWORD PTR [rsp+112]
lea r11, QWORD PTR [rsp+144]
mov rbx, QWORD PTR [r11+48]
mov rbp, QWORD PTR [r11+56]
mov rsi, QWORD PTR [r11+64]
movaps xmm6, XMMWORD PTR [r11-16]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm9, XMMWORD PTR [r11-64]
movaps xmm10, XMMWORD PTR [r11-80]
movaps xmm11, XMMWORD PTR [r11-96]
movaps xmm12, XMMWORD PTR [r11-112]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,66 @@
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r13
push r14
push r15
mov rdx, QWORD PTR [rcx]
mov esi, 524288
mov r11, QWORD PTR [rdx+32]
xor r11, QWORD PTR [rdx]
mov rdi, QWORD PTR [rdx+224]
mov rbx, QWORD PTR [rdx+40]
xor rbx, QWORD PTR [rdx+8]
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov rbp, QWORD PTR [rdx+240]
mov r14, QWORD PTR [rdx+232]
movq xmm2, rax
pinsrq xmm2, rcx, 1
ALIGN(64)
main_loop_cnv1_single:
mov r8, r11
and r8d, 2097136
movdqu xmm1, XMMWORD PTR [rdi+r8]
movq xmm0, r11
pinsrq xmm0, rbx, 1
aesenc xmm1, xmm0
movq r15, xmm1
mov r9, r15
and r9d, 2097136
movdqa xmm0, xmm1
pxor xmm0, xmm2
movdqa xmm2, xmm1
movq QWORD PTR [rdi+r8], xmm0
pextrq rdx, xmm0, 1
mov eax, edx
shr rax, 24
mov ecx, DWORD PTR [r14+rax*4]
xor rcx, rdx
mov QWORD PTR [rdi+r8+8], rcx
mov r10, QWORD PTR [rdi+r9]
mov r8, QWORD PTR [rdi+r9+8]
mov rax, r10
mul r15
add rbx, rax
add r11, rdx
mov QWORD PTR [rdi+r9], r11
mov rax, rbx
xor rbx, r8
xor r11, r10
xor rax, rbp
mov QWORD PTR [rdi+r9+8], rax
sub rsi, 1
jne main_loop_cnv1_single
pop r15
pop r14
pop r13
mov rbx, QWORD PTR [rsp+8]
mov rbp, QWORD PTR [rsp+16]
mov rsi, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [rsp+32]

View file

@ -11,6 +11,9 @@
# define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv1_single_mainloop_asm)
.global FN_PREFIX(cnv1_double_mainloop_asm)
.global FN_PREFIX(cnv1_quad_mainloop_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
@ -19,6 +22,33 @@
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm)
ALIGN(64)
FN_PREFIX(cnv1_single_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_single_main_loop.inc"
add rsp, 48
ret 0
mov eax, 3735929054 # 0xDEADC0DE sentinel marking the end of the function body
ALIGN(64)
FN_PREFIX(cnv1_double_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_double_main_loop.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv1_quad_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_quad_main_loop.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
sub rsp, 48

View file

@ -1,4 +1,7 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_single_mainloop_asm
PUBLIC cnv1_double_mainloop_asm
PUBLIC cnv1_quad_mainloop_asm
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
@ -6,6 +9,27 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN(64)
cnv1_single_mainloop_asm PROC
INCLUDE cn1/cnv1_single_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_single_mainloop_asm ENDP
ALIGN(64)
cnv1_double_mainloop_asm PROC
INCLUDE cn1/cnv1_double_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_double_mainloop_asm ENDP
ALIGN(64)
cnv1_quad_mainloop_asm PROC
INCLUDE cn1/cnv1_quad_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_quad_mainloop_asm ENDP
ALIGN(64)
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc

View file

@ -0,0 +1,132 @@
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 32
mov rdx, QWORD PTR [rcx]
mov r8, QWORD PTR [rcx+8]
mov r12d, 524288
movaps XMMWORD PTR [rsp+16], xmm6
mov rbx, QWORD PTR [rdx+32]
xor rbx, QWORD PTR [rdx]
mov rsi, QWORD PTR [rdx+40]
mov r10, rbx
xor rsi, QWORD PTR [rdx+8]
and r10d, 2097136
mov rdi, QWORD PTR [r8+32]
xor rdi, QWORD PTR [r8]
movd xmm3, rbx
mov rbp, QWORD PTR [r8+40]
mov r9, rdi
xor rbp, QWORD PTR [r8+8]
movd xmm0, rsi
mov rcx, QWORD PTR [rdx+56]
and r9d, 2097136
xor rcx, QWORD PTR [rdx+24]
movd xmm4, rdi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov r14, QWORD PTR [rdx+224]
mov r13, QWORD PTR [rdx+232]
mov r15, QWORD PTR [r8+224]
punpcklqdq xmm3, xmm0
movd xmm0, rbp
movd xmm5, rax
punpcklqdq xmm4, xmm0
mov rax, QWORD PTR [r8+48]
movd xmm0, rcx
xor rax, QWORD PTR [r8+16]
mov rcx, QWORD PTR [r8+56]
xor rcx, QWORD PTR [r8+24]
movdqu xmm1, XMMWORD PTR [r14+r10]
movd xmm6, rax
punpcklqdq xmm5, xmm0
mov rax, QWORD PTR [rdx+240]
movd xmm0, rcx
movdqu xmm2, XMMWORD PTR [r15+r9]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r8+240]
mov QWORD PTR [rsp+8], rax
punpcklqdq xmm6, xmm0
ALIGN(64)
main_loop_cnv1_double:
aesenc xmm1, xmm3
aesenc xmm2, xmm4
movdqa xmm0, xmm1
movd r11, xmm2
pxor xmm0, xmm5
movdqa xmm5, xmm1
movd QWORD PTR [r14+r10], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
movdqa xmm0, xmm2
shr rax, 24
pxor xmm0, xmm6
movdqa xmm6, xmm2
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
mov QWORD PTR [r14+r10+8], rax
movd QWORD PTR [r15+r9], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
shr rax, 24
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
movd rcx, xmm1
mov QWORD PTR [r15+r9+8], rax
mov r9, rcx
and r9d, 2097136
mov r10, QWORD PTR [r14+r9]
mov r8, QWORD PTR [r14+r9+8]
mov rax, r10
mul rcx
add rsi, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [r14+r9], rbx
xor rax, rsi
mov QWORD PTR [r14+r9+8], rax
xor rsi, r8
xor rbx, r10
mov r8, r11
and r8d, 2097136
mov r10, rbx
and r10d, 2097136
movd xmm3, rbx
pinsrq xmm3, rsi, 1
mov r9, QWORD PTR [r15+r8]
mov rcx, QWORD PTR [r15+r8+8]
mov rax, r9
movdqu xmm1, XMMWORD PTR [r14+r10]
mul r11
add rbp, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r15+r8], rdi
xor rax, rbp
xor rdi, r9
mov QWORD PTR [r15+r8+8], rax
mov r9, rdi
xor rbp, rcx
and r9d, 2097136
movd xmm4, rdi
pinsrq xmm4, rbp, 1
movdqu xmm2, XMMWORD PTR [r15+r9]
sub r12, 1
jne main_loop_cnv1_double
mov rbx, QWORD PTR [rsp+80]
mov rbp, QWORD PTR [rsp+88]
mov rsi, QWORD PTR [rsp+96]
movaps xmm6, XMMWORD PTR [rsp+16]
add rsp, 32
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,263 @@
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+16], rbp
mov QWORD PTR [rax+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 144
mov r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+8]
mov r10, QWORD PTR [rcx+16]
mov r11, QWORD PTR [rcx+24]
mov rbp, QWORD PTR [r8+224]
mov r13, QWORD PTR [r8+232]
mov r14, QWORD PTR [r9+224]
mov r15, QWORD PTR [r10+224]
mov r12, QWORD PTR [r11+224]
mov rcx, QWORD PTR [r8+40]
xor rcx, QWORD PTR [r8+8]
mov rbx, QWORD PTR [r8+32]
xor rbx, QWORD PTR [r8]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
movd xmm0, rcx
mov rcx, QWORD PTR [r9+40]
xor rcx, QWORD PTR [r9+8]
movd xmm1, rbx
movaps XMMWORD PTR [rax-56], xmm6
movaps XMMWORD PTR [rax-72], xmm7
movaps XMMWORD PTR [rax-88], xmm8
movaps XMMWORD PTR [rax-104], xmm9
movaps XMMWORD PTR [rax-120], xmm10
movaps XMMWORD PTR [rsp+48], xmm11
movaps XMMWORD PTR [rsp+32], xmm12
and ebx, 2097136
mov rsi, QWORD PTR [r10+32]
movd xmm2, rdi
mov rax, QWORD PTR [r8+240]
and edi, 2097136
xor rsi, QWORD PTR [r10]
mov rdx, QWORD PTR [r8+56]
xor rdx, QWORD PTR [r8+24]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r9+240]
movd xmm3, rsi
mov QWORD PTR [rsp+8], rax
and esi, 2097136
mov rax, QWORD PTR [r10+240]
punpcklqdq xmm1, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r10+40]
xor rcx, QWORD PTR [r10+8]
mov QWORD PTR [rsp+16], rax
mov rax, QWORD PTR [r11+240]
punpcklqdq xmm2, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp+24], rax
mov rcx, QWORD PTR [r11+40]
xor rcx, QWORD PTR [r11+8]
mov rax, QWORD PTR [r11+32]
xor rax, QWORD PTR [r11]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r8+48]
xor rcx, QWORD PTR [r8+16]
movd xmm4, rax
and eax, 2097136
punpcklqdq xmm4, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r9+56]
xor rdx, QWORD PTR [r9+24]
movd xmm5, rcx
mov rcx, QWORD PTR [r9+48]
xor rcx, QWORD PTR [r9+16]
punpcklqdq xmm5, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r10+56]
xor rdx, QWORD PTR [r10+24]
movd xmm6, rcx
mov rcx, QWORD PTR [r10+48]
xor rcx, QWORD PTR [r10+16]
punpcklqdq xmm6, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r11+56]
movd xmm7, rcx
punpcklqdq xmm7, xmm0
xor rdx, QWORD PTR [r11+24]
mov rcx, QWORD PTR [r11+48]
xor rcx, QWORD PTR [r11+16]
mov r11d, 524288
movdqu xmm9, XMMWORD PTR [rbp+rbx]
movdqu xmm10, XMMWORD PTR [r14+rdi]
movd xmm0, rdx
movdqu xmm11, XMMWORD PTR [r15+rsi]
movdqu xmm12, XMMWORD PTR [r12+rax]
movd xmm8, rcx
punpcklqdq xmm8, xmm0
ALIGN(64)
main_loop_cnv1_quad:
aesenc xmm9, xmm1
aesenc xmm10, xmm2
aesenc xmm11, xmm3
aesenc xmm12, xmm4
movd ecx, xmm9
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+rbp]
movd ecx, xmm10
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r14]
movd ecx, xmm11
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r15]
movd ecx, xmm12
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r12]
movdqa xmm0, xmm9
pxor xmm0, xmm5
movdqa xmm5, xmm9
movd QWORD PTR [rbp+rbx], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm10
shr rcx, 24
pxor xmm0, xmm6
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [rbp+rbx+8], rcx
movd rbx, xmm1
movd QWORD PTR [r14+rdi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm11
shr rcx, 24
pxor xmm0, xmm7
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r14+rdi+8], rcx
movd rdi, xmm2
movd QWORD PTR [r15+rsi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm12
shr rcx, 24
pxor xmm0, xmm8
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r15+rsi+8], rcx
movd QWORD PTR [r12+rax], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
shr rcx, 24
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r12+rax+8], rcx
movd rcx, xmm9
mov r8, rcx
and r8d, 2097136
mov r9, QWORD PTR [rbp+r8]
mov r10, QWORD PTR [rbp+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm1, 1
add rcx, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [rbp+r8], rbx
xor rax, rcx
mov QWORD PTR [rbp+r8+8], rax
xor rcx, r10
xor rbx, r9
movd xmm1, rbx
and ebx, 2097136
pinsrq xmm1, rcx, 1
movd rcx, xmm10
mov r8, rcx
and r8d, 2097136
movdqu xmm9, XMMWORD PTR [rbp+rbx]
mov r9, QWORD PTR [r14+r8]
mov r10, QWORD PTR [r14+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm2, 1
add rcx, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r14+r8], rdi
xor rax, rcx
xor rdi, r9
mov QWORD PTR [r14+r8+8], rax
xor rcx, r10
movd xmm2, rdi
and edi, 2097136
pinsrq xmm2, rcx, 1
movd rcx, xmm11
movd rsi, xmm3
mov r8, rcx
and r8d, 2097136
movdqa xmm6, xmm10
movdqa xmm7, xmm11
movdqa xmm8, xmm12
movdqu xmm10, XMMWORD PTR [r14+rdi]
mov r9, QWORD PTR [r15+r8]
mov r10, QWORD PTR [r15+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm3, 1
add rcx, rax
add rsi, rdx
mov rax, QWORD PTR [rsp+16]
xor rax, rcx
mov QWORD PTR [r15+r8], rsi
mov QWORD PTR [r15+r8+8], rax
xor rcx, r10
xor rsi, r9
movd xmm3, rsi
and esi, 2097136
pinsrq xmm3, rcx, 1
movd rcx, xmm12
mov r8, rcx
and r8d, 2097136
movdqu xmm11, XMMWORD PTR [r15+rsi]
mov r9, QWORD PTR [r12+r8]
mov r10, QWORD PTR [r12+r8+8]
mov rax, r9
mul rcx
mov rcx, rax
movd rax, xmm4
add rax, rdx
mov QWORD PTR [r12+r8], rax
xor rax, r9
pextrq rdx, xmm4, 1
add rdx, rcx
mov rcx, QWORD PTR [rsp+24]
xor rcx, rdx
xor rdx, r10
movd xmm4, rax
mov QWORD PTR [r12+r8+8], rcx
and eax, 2097136
pinsrq xmm4, rdx, 1
movdqu xmm12, XMMWORD PTR [r12+rax]
sub r11, 1
jne main_loop_cnv1_quad
movaps xmm7, XMMWORD PTR [rsp+112]
lea r11, QWORD PTR [rsp+144]
mov rbx, QWORD PTR [r11+48]
mov rbp, QWORD PTR [r11+56]
mov rsi, QWORD PTR [r11+64]
movaps xmm6, XMMWORD PTR [r11-16]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm9, XMMWORD PTR [r11-64]
movaps xmm10, XMMWORD PTR [r11-80]
movaps xmm11, XMMWORD PTR [r11-96]
movaps xmm12, XMMWORD PTR [r11-112]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,66 @@
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r13
push r14
push r15
mov rdx, QWORD PTR [rcx]
mov esi, 524288
mov r11, QWORD PTR [rdx+32]
xor r11, QWORD PTR [rdx]
mov rdi, QWORD PTR [rdx+224]
mov rbx, QWORD PTR [rdx+40]
xor rbx, QWORD PTR [rdx+8]
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov rbp, QWORD PTR [rdx+240]
mov r14, QWORD PTR [rdx+232]
movd xmm2, rax
pinsrq xmm2, rcx, 1
ALIGN(64)
main_loop_cnv1_single:
mov r8, r11
and r8d, 2097136
movdqu xmm1, XMMWORD PTR [rdi+r8]
movd xmm0, r11
pinsrq xmm0, rbx, 1
aesenc xmm1, xmm0
movd r15, xmm1
mov r9, r15
and r9d, 2097136
movdqa xmm0, xmm1
pxor xmm0, xmm2
movdqa xmm2, xmm1
movd QWORD PTR [rdi+r8], xmm0
pextrq rdx, xmm0, 1
mov eax, edx
shr rax, 24
mov ecx, DWORD PTR [r14+rax*4]
xor rcx, rdx
mov QWORD PTR [rdi+r8+8], rcx
mov r10, QWORD PTR [rdi+r9]
mov r8, QWORD PTR [rdi+r9+8]
mov rax, r10
mul r15
add rbx, rax
add r11, rdx
mov QWORD PTR [rdi+r9], r11
mov rax, rbx
xor rbx, r8
xor r11, r10
xor rax, rbp
mov QWORD PTR [rdi+r9+8], rax
sub rsi, 1
jne main_loop_cnv1_single
pop r15
pop r14
pop r13
mov rbx, QWORD PTR [rsp+8]
mov rbp, QWORD PTR [rsp+16]
mov rsi, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [rsp+32]

View file

@ -1,6 +1,9 @@
#define ALIGN(x) .align 64
.intel_syntax noprefix
.section .text
.global cnv1_single_mainloop_asm
.global cnv1_double_mainloop_asm
.global cnv1_quad_mainloop_asm
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_mainloop_bulldozer_asm
@ -9,6 +12,24 @@
.global cnv2_rwz_double_mainloop_asm
.global cnv2_upx_double_mainloop_zen3_asm
ALIGN(64)
cnv1_single_mainloop_asm:
#include "../cn1/cnv1_single_main_loop.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv1_double_mainloop_asm:
#include "../cn1/cnv1_double_main_loop.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv1_quad_mainloop_asm:
#include "../cn1/cnv1_quad_main_loop.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_ivybridge_asm:
#include "../cn2/cnv2_main_loop_ivybridge.inc"

View file

@ -1,4 +1,7 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_single_mainloop_asm
PUBLIC cnv1_double_mainloop_asm
PUBLIC cnv1_quad_mainloop_asm
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
@ -6,28 +9,49 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN 64
ALIGN(64)
cnv1_single_mainloop_asm PROC
INCLUDE cn1/cnv1_single_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_single_mainloop_asm ENDP
ALIGN(64)
cnv1_double_mainloop_asm PROC
INCLUDE cn1/cnv1_double_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_double_mainloop_asm ENDP
ALIGN(64)
cnv1_quad_mainloop_asm PROC
INCLUDE cn1/cnv1_quad_main_loop.inc
ret 0
mov eax, 3735929054
cnv1_quad_mainloop_asm ENDP
ALIGN(64)
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ivybridge_asm ENDP
ALIGN 64
ALIGN(64)
cnv2_mainloop_ryzen_asm PROC
INCLUDE cn2/cnv2_main_loop_ryzen.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
ALIGN(64)
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_bulldozer_asm ENDP
ALIGN 64
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
ret 0

View file

@ -4,7 +4,7 @@
*
* This work is based on the implementation of
* Soeren S. Thomsen and Krystian Matusiewicz
*
*
*
*/
@ -22,7 +22,7 @@ const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
v1 = temp_var;}
#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \
tu = T[2*(uint32_t)x[4*c0+0]]; \
@ -161,11 +161,11 @@ static void F512(uint32_t *h, const uint32_t *m) {
/* digest up to msglen bytes of input (full blocks only) */
static void Transform(groestlHashState *ctx,
const uint8_t *input,
const uint8_t *input,
int msglen) {
/* digest message, one block at a time */
for (; msglen >= SIZE512;
for (; msglen >= SIZE512;
msglen -= SIZE512, input += SIZE512) {
F512(ctx->chaining,(uint32_t*)input);
@ -199,7 +199,7 @@ static void OutputTransformation(groestlHashState *ctx) {
RND512P((uint8_t*)y, temp, 0x00000009);
for (j = 0; j < 2*COLS512; j++) {
ctx->chaining[j] ^= temp[j];
}
}
}
/* initialise context */
@ -313,7 +313,7 @@ static void Final(groestlHashState* ctx,
ctx->block_counter2 >>= 8;
}
/* digest final padding block */
Transform(ctx, ctx->buffer, SIZE512);
Transform(ctx, ctx->buffer, SIZE512);
/* perform output transformation */
OutputTransformation(ctx);
@ -332,7 +332,7 @@ static void Final(groestlHashState* ctx,
}
/* hash bit sequence */
void groestl(const BitSequence* data,
void groestl(const BitSequence* data,
DataLength databitlen,
BitSequence* hashval) {

View file

@ -4,10 +4,10 @@
#include "crypto_uint8.h"
#include "crypto_uint32.h"
#include "crypto_uint64.h"
#include "crypto_hash.h"
#include "crypto_hash.h"
typedef crypto_uint8 uint8_t;
typedef crypto_uint32 uint32_t;
typedef crypto_uint8 uint8_t;
typedef crypto_uint32 uint32_t;
typedef crypto_uint64 uint64_t;
*/
#include <stdint.h>

View file

@ -5,7 +5,7 @@
** Source code author: Doug Whiting, 2008.
**
** This algorithm and source code is released to the public domain.
**
**
************************************************************************/
#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
@ -57,7 +57,7 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
/*****************************************************************
** "Internal" Skein definitions
** -- not needed for sequential hashing API, but will be
** -- not needed for sequential hashing API, but will be
** helpful for other uses of Skein (e.g., tree hash mode).
** -- included here so that they can be shared between
** reference and optimized code.
@ -179,11 +179,11 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
#define Skein_assert(x)
#elif defined(SKEIN_ASSERT)
#include <assert.h>
#define Skein_Assert(x,retCode) assert(x)
#define Skein_assert(x) assert(x)
#include <assert.h>
#define Skein_Assert(x,retCode) assert(x)
#define Skein_assert(x) assert(x)
#else
#include <assert.h>
#include <assert.h>
#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */
#define Skein_assert(x) assert(x) /* internal error */
#endif
@ -191,8 +191,8 @@ static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
/*****************************************************************
** Skein block function constants (shared across Ref and Opt code)
******************************************************************/
enum
{
enum
{
/* Skein_512 round rotation constants */
R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
@ -251,7 +251,7 @@ const u64b_t SKEIN_512_IV_256[] =
#define BLK_BITS (WCNT*64) /* some useful definitions for code here */
#define KW_TWK_BASE (0)
#define KW_KEY_BASE (3)
#define ks (kw + KW_KEY_BASE)
#define ks (kw + KW_KEY_BASE)
#define ts (kw + KW_TWK_BASE)
#ifdef SKEIN_DEBUG
@ -310,7 +310,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
ks[5] = ctx->X[5];
ks[6] = ctx->X[6];
ks[7] = ctx->X[7];
ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
ts[2] = ts[0] ^ ts[1];
@ -338,7 +338,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
#if SKEIN_UNROLL_512 == 0
#if SKEIN_UNROLL_512 == 0
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \
Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
@ -469,7 +469,7 @@ static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
u08b_t b[SKEIN_512_STATE_BYTES];
u64b_t w[SKEIN_512_STATE_WORDS];
} cfg; /* config block */
Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
@ -548,7 +548,7 @@ static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msg
return SKEIN_SUCCESS;
}
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* finalize the hash computation and output the result */
static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
@ -562,7 +562,7 @@ static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */
/* now output the result */
byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */

View file

@ -9,7 +9,7 @@
** This algorithm and source code is released to the public domain.
**
***************************************************************************
**
**
** The following compile-time switches may be defined to control some
** tradeoffs between speed, code size, error checking, and security.
**
@ -20,8 +20,8 @@
** [default: no callouts (no overhead)]
**
** SKEIN_ERR_CHECK -- how error checking is handled inside Skein
** code. If not defined, most error checking
** is disabled (for performance). Otherwise,
** code. If not defined, most error checking
** is disabled (for performance). Otherwise,
** the switch value is interpreted as:
** 0: use assert() to flag errors
** 1: return SKEIN_FAIL to flag errors

View file

@ -124,9 +124,9 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key)
static inline uint32_t sub_word(uint32_t key)
{
return (saes_sbox[key >> 24 ] << 24) |
(saes_sbox[(key >> 16) & 0xff] << 16 ) |
(saes_sbox[(key >> 8) & 0xff] << 8 ) |
return (saes_sbox[key >> 24 ] << 24) |
(saes_sbox[(key >> 16) & 0xff] << 16 ) |
(saes_sbox[(key >> 8) & 0xff] << 8 ) |
saes_sbox[key & 0xff];
}

View file

@ -344,7 +344,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__) && \
((__GNUC__ <= 10 && defined(__arm__)) || \
((__GNUC__ <= 11 && defined(__arm__)) || \
(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
(__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)

View file

@ -16,6 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "base/tools/Alignment.h"
#include "crypto/common/Nonce.h"
@ -53,10 +54,10 @@ bool xmrig::Nonce::next(uint8_t index, uint32_t *nonce, uint32_t reserveCount, u
continue;
}
*nonce = (nonce[0] & ~mask) | counter;
writeUnaligned(nonce, static_cast<uint32_t>((readUnaligned(nonce) & ~mask) | counter));
if (mask > 0xFFFFFFFFULL) {
nonce[1] = (nonce[1] & (~mask >> 32)) | (counter >> 32);
writeUnaligned(nonce + 1, static_cast<uint32_t>((readUnaligned(nonce + 1) & (~mask >> 32)) | (counter >> 32)));
}
return true;
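// For context, the readUnaligned/writeUnaligned helpers used above are the
// usual memcpy-based unaligned-access idiom; a sketch of their assumed shape
// (the actual base/tools/Alignment.h may differ):
#include <cstring>

template<typename T>
inline T readUnaligned(const T *ptr)
{
    T value;
    std::memcpy(&value, ptr, sizeof(T)); // defined behavior for any alignment
    return value;
}

template<typename T>
inline void writeUnaligned(T *ptr, T value)
{
    std::memcpy(ptr, &value, sizeof(T));
}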

View file

@ -0,0 +1,85 @@
cmake_minimum_required(VERSION 2.8.12)
project(GhostRider)
set(HEADERS
sph_types.h
sph_blake.h
sph_bmw.h
sph_cubehash.h
sph_echo.h
sph_fugue.h
sph_groestl.h
sph_hamsi.h
sph_jh.h
sph_keccak.h
sph_luffa.h
sph_sha2.h
sph_shabal.h
sph_shavite.h
sph_simd.h
sph_skein.h
sph_whirlpool.h
ghostrider.h
)
set(SOURCES
sph_blake.c
sph_bmw.c
sph_cubehash.c
sph_echo.c
sph_fugue.c
sph_groestl.c
sph_hamsi.c
sph_jh.c
sph_keccak.c
sph_luffa.c
sph_shabal.c
sph_shavite.c
sph_simd.c
sph_sha2.c
sph_skein.c
sph_whirlpool.c
ghostrider.cpp
)
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS_RELEASE "/O1 /Oi /Os")
elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Os -fno-tree-vrp")
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Os -Wno-unused-const-variable")
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "-Os")
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "-Os")
endif()
include_directories(.)
include_directories(../..)
include_directories(${UV_INCLUDE_DIR})
add_library(ghostrider STATIC ${HEADERS} ${SOURCES})

View file

@ -0,0 +1,39 @@
# GhostRider (Raptoreum) release notes
**XMRig** supports GhostRider algorithm starting from version **v6.16.0**.
No tuning is required - auto-config works well on most CPUs!
### Sample command line (non-SSL port)
```
xmrig -a gr -o raptoreumemporium.com:3008 -u WALLET_ADDRESS -p x
```
### Sample command line (SSL port)
```
xmrig -a gr -o rtm.suprnova.cc:4273 --tls -u WALLET_ADDRESS -p x
```
You can use **rtm_ghostrider_example.cmd** as a template and put the pool URL and your wallet address there. The general XMRig documentation is available [here](https://xmrig.com/docs/miner).
**Using the `--threads` or `-t` option is NOT recommended because it turns off the advanced built-in config.** If you want to tweak the number of threads used for GhostRider, it's recommended to switch to config.json instead of the command line. The most suitable command-line option for this is `--cpu-max-threads-hint=N`, where N can be between 0 and 100.
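For example, a minimal config.json fragment using this hint might look like the following (all other options left at their defaults; 75 is just an illustrative value):
```
{
    "cpu": {
        "max-threads-hint": 75
    }
}
```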
## Performance
While the individual algorithm implementations are not fully optimized yet, XMRig achieves higher hashrates through better auto-config and more fine-grained thread scheduling: it can calculate a single batch of hashes using 2 threads for the parts that don't require much cache. For example, on a typical Intel CPU (2 MB cache per core) it will use 1 thread per core for cn/fast and 2 threads per core for the other Cryptonight variants while calculating the same batch of hashes, always achieving more than 50% CPU load.
For the same reason, XMRig can sometimes use less than 100% CPU on Ryzen 3000/5000 CPUs if it finds that running 1 thread per core is faster for some Cryptonight variants on your system.
**Windows** (detailed results [here](https://imgur.com/a/0njIVVW))
CPU|cpuminer-gr-avx2 1.2.4.1 (tuned), h/s|XMRig v6.16.2 (MSVC build), h/s|Speedup
-|-|-|-
AMD Ryzen 7 4700U|632.6|733.1|+15.89%
Intel Core i7-2600|496.4|554.6|+11.72%
AMD Ryzen 7 3700X @ 4.1 GHz|2453.0|2496.5|+1.77%
AMD Ryzen 5 5600X @ 4.65 GHz|2112.6|2337.5|+10.65%
**Linux (outdated)** (tested by **Delgon**, detailed results [here](https://cdn.discordapp.com/attachments/604375870236524574/913167614749048872/unknown.png))
CPU|cpuminer-gr-avx2 1.2.4.1 (tuned), h/s|XMRig v6.16.0 (GCC build), h/s|Speedup
-|-|-|-
AMD Ryzen 9 3900X|3746.51|3604.89|-3.78%
2xIntel Xeon E5-2698v3|2563.4|2638.38|+2.925%

View file

@ -0,0 +1,392 @@
/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
/*
* AES tables. This file is not meant to be compiled by itself; it
* is included by some hash function implementations. It contains
* the precomputed tables and helper macros for evaluating an AES
* round, optionally with a final XOR with a subkey.
*
* By default, this file defines the tables and macros for little-endian
* processing (i.e. it is assumed that the input bytes have been read
* from memory and assembled with the little-endian convention). If
* the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
* when this file is included, then the tables and macros for big-endian
* processing are defined instead. The big-endian tables and macros have
* names distinct from the little-endian tables and macros, hence it is
* possible to have both simultaneously, by including this file twice
* (with and without the AES_BIG_ENDIAN macro).
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
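/* A sketch of the double-inclusion pattern described above (hypothetical
 * consumer code, kept out of compilation here; the hash implementations in
 * this tree include the file once, little-endian): */
#if 0
#define AES_BIG_ENDIAN 0
#include "aes_helper.c"   /* defines AES0_LE..AES3_LE and AES_ROUND_LE */
#undef  AES_BIG_ENDIAN
#define AES_BIG_ENDIAN 1
#include "aes_helper.c"   /* defines AES0_BE..AES3_BE and AES_ROUND_BE */
#endif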
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
#if AES_BIG_ENDIAN
#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define AES0 AES0_BE
#define AES1 AES1_BE
#define AES2 AES2_BE
#define AES3 AES3_BE
#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
(Y0) = AES0[((X0) >> 24) & 0xFF] \
^ AES1[((X1) >> 16) & 0xFF] \
^ AES2[((X2) >> 8) & 0xFF] \
^ AES3[(X3) & 0xFF] ^ (K0); \
(Y1) = AES0[((X1) >> 24) & 0xFF] \
^ AES1[((X2) >> 16) & 0xFF] \
^ AES2[((X3) >> 8) & 0xFF] \
^ AES3[(X0) & 0xFF] ^ (K1); \
(Y2) = AES0[((X2) >> 24) & 0xFF] \
^ AES1[((X3) >> 16) & 0xFF] \
^ AES2[((X0) >> 8) & 0xFF] \
^ AES3[(X1) & 0xFF] ^ (K2); \
(Y3) = AES0[((X3) >> 24) & 0xFF] \
^ AES1[((X0) >> 16) & 0xFF] \
^ AES2[((X1) >> 8) & 0xFF] \
^ AES3[(X2) & 0xFF] ^ (K3); \
} while (0)
#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
#else
#define AESx(x) SPH_C32(x)
#define AES0 AES0_LE
#define AES1 AES1_LE
#define AES2 AES2_LE
#define AES3 AES3_LE
#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \
} while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
#endif
/*
* The AES*[] tables allow us to perform a fast evaluation of an AES
* round; table AESi[] combines SubBytes for a byte at row i, and
* MixColumns for the column where that byte goes after ShiftRows.
*/
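/* A minimal standalone sketch (not part of the original file) of how one such
 * entry is derived: the column for input byte x is (2*s, s, s, 3*s) with
 * s = sbox[x], packed little-endian. For x = 0, s = 0x63, this reproduces
 * 0xA56363C6, the first AES0 entry below. */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t xtime(uint32_t s) { return ((s << 1) ^ ((s & 0x80) ? 0x11B : 0)) & 0xFF; }

int main(void)
{
    const uint32_t s  = 0x63;       /* AES S-box output for input 0x00 */
    const uint32_t s2 = xtime(s);   /* 2*s in GF(2^8) */
    const uint32_t s3 = s2 ^ s;     /* 3*s */
    printf("0x%08X\n", s2 | (s << 8) | (s << 16) | (s3 << 24));
    return 0;
}
#endif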
static const sph_u32 AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
static const sph_u32 AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
static const sph_u32 AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
static const sph_u32 AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
};
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,860 @@
/* XMRig
* Copyright 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "ghostrider.h"
#include "sph_blake.h"
#include "sph_bmw.h"
#include "sph_groestl.h"
#include "sph_jh.h"
#include "sph_keccak.h"
#include "sph_skein.h"
#include "sph_luffa.h"
#include "sph_cubehash.h"
#include "sph_shavite.h"
#include "sph_simd.h"
#include "sph_echo.h"
#include "sph_hamsi.h"
#include "sph_fugue.h"
#include "sph_shabal.h"
#include "sph_whirlpool.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include "base/tools/Chrono.h"
#include "backend/cpu/Cpu.h"
#include "crypto/cn/CnHash.h"
#include "crypto/cn/CnCtx.h"
#include "crypto/cn/CryptoNight.h"
#include "crypto/common/VirtualMemory.h"
#include <thread>
#include <atomic>
#include <uv.h>
#ifdef XMRIG_FEATURE_HWLOC
#include "base/kernel/OS.h"
#include "backend/cpu/platform/HwlocCpuInfo.h"
#include <hwloc.h>
#endif
#if defined(XMRIG_ARM)
# include "crypto/cn/sse2neon.h"
#elif defined(__GNUC__)
# include <x86intrin.h>
#else
# include <intrin.h>
#endif
#define CORE_HASH(i, x) static void h##i(const uint8_t* data, size_t size, uint8_t* output) \
{ \
sph_##x##_context ctx; \
sph_##x##_init(&ctx); \
sph_##x(&ctx, data, size); \
sph_##x##_close(&ctx, output); \
}
CORE_HASH( 0, blake512 );
CORE_HASH( 1, bmw512 );
CORE_HASH( 2, groestl512 );
CORE_HASH( 3, jh512 );
CORE_HASH( 4, keccak512 );
CORE_HASH( 5, skein512 );
CORE_HASH( 6, luffa512 );
CORE_HASH( 7, cubehash512);
CORE_HASH( 8, shavite512 );
CORE_HASH( 9, simd512 );
CORE_HASH(10, echo512 );
CORE_HASH(11, hamsi512 );
CORE_HASH(12, fugue512 );
CORE_HASH(13, shabal512 );
CORE_HASH(14, whirlpool );
#undef CORE_HASH
typedef void (*core_hash_func)(const uint8_t* data, size_t size, uint8_t* output);
static const core_hash_func core_hash[15] = { h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, h14 };
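// Illustrative usage (a sketch, not part of the original): every entry
// maps an arbitrary input buffer to a 64-byte digest, e.g. BLAKE-512:
//
//   uint8_t digest[64];
//   core_hash[0](input, input_size, digest);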
namespace xmrig
{
static constexpr Algorithm::Id cn_hash[6] = {
Algorithm::CN_GR_0,
Algorithm::CN_GR_1,
Algorithm::CN_GR_2,
Algorithm::CN_GR_3,
Algorithm::CN_GR_4,
Algorithm::CN_GR_5,
};
static constexpr const char* cn_names[6] = {
"cn/dark (512 KB)",
"cn/dark-lite (256 KB)",
"cn/fast (2 MB)",
"cn/lite (1 MB)",
"cn/turtle (256 KB)",
"cn/turtle-lite (128 KB)",
};
static constexpr size_t cn_sizes[6] = {
Algorithm::l3(Algorithm::CN_GR_0), // 512 KB
Algorithm::l3(Algorithm::CN_GR_1) / 2, // 256 KB
Algorithm::l3(Algorithm::CN_GR_2), // 2 MB
Algorithm::l3(Algorithm::CN_GR_3), // 1 MB
Algorithm::l3(Algorithm::CN_GR_4), // 256 KB
Algorithm::l3(Algorithm::CN_GR_5) / 2, // 128 KB
};
static constexpr CnHash::AlgoVariant av_hw_aes[5] = { CnHash::AV_SINGLE, CnHash::AV_SINGLE, CnHash::AV_DOUBLE, CnHash::AV_TRIPLE, CnHash::AV_QUAD };
static constexpr CnHash::AlgoVariant av_soft_aes[5] = { CnHash::AV_SINGLE_SOFT, CnHash::AV_SINGLE_SOFT, CnHash::AV_DOUBLE_SOFT, CnHash::AV_TRIPLE_SOFT, CnHash::AV_QUAD_SOFT };
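// Both tables are indexed by the number of hashes computed per call
// ("step" below: 1, 2 or 4), so av[step] selects the matching
// single/double/quad CryptoNight kernel; index 3 (triple) is never used
// because step is always a power of two.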
template<size_t N>
static inline void select_indices(uint32_t (&indices)[N], const uint8_t* seed)
{
bool selected[N] = {};
uint32_t k = 0;
for (uint32_t i = 0; i < 64; ++i) {
const uint8_t index = ((seed[i / 2] >> ((i & 1) * 4)) & 0xF) % N;
if (!selected[index]) {
selected[index] = true;
indices[k++] = index;
if (k >= N) {
return;
}
}
}
for (uint32_t i = 0; i < N; ++i) {
if (!selected[i]) {
indices[k++] = i;
}
}
}
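// Illustrative sketch of the selection rule implemented above: the 64 seed
// nibbles pick not-yet-chosen indices in order of appearance, and any
// still-missing indices are appended in ascending order, so the result is
// always a permutation of 0..N-1. hash_octa() below calls it twice:
//
//   uint32_t cn_indices[6];              // cn_indices[0..2] pick the CN parts
//   uint32_t core_indices[15];           // 15 core rounds, 5 per part
//   select_indices(cn_indices, seed);    // seed = 32-byte PrevBlockHash
//   select_indices(core_indices, seed);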
namespace ghostrider
{
#ifdef XMRIG_FEATURE_HWLOC
static struct AlgoTune
{
double hashrate = 0.0;
uint32_t step = 1;
uint32_t threads = 1;
} tuneDefault[6], tune8MB[6];
struct HelperThread
{
HelperThread(hwloc_bitmap_t cpu_set, int priority, bool is8MB) : m_cpuSet(cpu_set), m_priority(priority), m_is8MB(is8MB)
{
uv_mutex_init(&m_mutex);
uv_cond_init(&m_cond);
m_thread = new std::thread(&HelperThread::run, this);
do {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
} while (!m_ready);
}
~HelperThread()
{
uv_mutex_lock(&m_mutex);
m_finished = true;
uv_cond_signal(&m_cond);
uv_mutex_unlock(&m_mutex);
m_thread->join();
delete m_thread;
uv_mutex_destroy(&m_mutex);
uv_cond_destroy(&m_cond);
hwloc_bitmap_free(m_cpuSet);
}
struct TaskBase
{
virtual ~TaskBase() {}
virtual void run() = 0;
};
template<typename T>
struct Task : TaskBase
{
inline Task(T&& task) : m_task(std::move(task))
{
static_assert(sizeof(Task) <= 128, "Task struct is too large");
}
void run() override
{
m_task();
this->~Task();
}
T m_task;
};
template<typename T>
inline void launch_task(T&& task)
{
uv_mutex_lock(&m_mutex);
new (&m_tasks[m_numTasks++]) Task<T>(std::move(task));
uv_cond_signal(&m_cond);
uv_mutex_unlock(&m_mutex);
}
inline void wait() const
{
while (m_numTasks) {
_mm_pause();
}
}
void run()
{
if (hwloc_bitmap_weight(m_cpuSet) > 0) {
hwloc_topology_t topology = reinterpret_cast<HwlocCpuInfo*>(Cpu::info())->topology();
if (hwloc_set_cpubind(topology, m_cpuSet, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT) < 0) {
hwloc_set_cpubind(topology, m_cpuSet, HWLOC_CPUBIND_THREAD);
}
}
OS::setThreadPriority(m_priority);
uv_mutex_lock(&m_mutex);
m_ready = true;
do {
uv_cond_wait(&m_cond, &m_mutex);
const uint32_t n = m_numTasks;
if (n > 0) {
for (uint32_t i = 0; i < n; ++i) {
reinterpret_cast<TaskBase*>(&m_tasks[i])->run();
}
std::atomic_thread_fence(std::memory_order_seq_cst);
m_numTasks = 0;
}
} while (!m_finished);
uv_mutex_unlock(&m_mutex);
}
uv_mutex_t m_mutex;
uv_cond_t m_cond;
alignas(16) uint8_t m_tasks[4][128] = {};
volatile uint32_t m_numTasks = 0;
volatile bool m_ready = false;
volatile bool m_finished = false;
hwloc_bitmap_t m_cpuSet = {};
int m_priority = -1;
bool m_is8MB = false;
std::thread* m_thread = nullptr;
};
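// Illustrative usage (a sketch): the benchmark and hash_octa() below hand
// half of a batch to the helper while the calling thread hashes the rest:
//
//   helper->launch_task([&]() { /* second half of the batch */ });
//   /* ... hash the first half on this thread ... */
//   helper->wait(); // spin until the queued tasks have drained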
void benchmark()
{
#ifndef XMRIG_ARM
static std::atomic<int> done{ 0 };
if (done.exchange(1)) {
return;
}
std::thread t([]() {
// Try to avoid CPU core 0 because many system threads use it and can interfere
uint32_t thread_index1 = (Cpu::info()->threads() > 2) ? 2 : 0;
hwloc_topology_t topology = reinterpret_cast<HwlocCpuInfo*>(Cpu::info())->topology();
hwloc_obj_t pu = hwloc_get_pu_obj_by_os_index(topology, thread_index1);
hwloc_obj_t pu2;
hwloc_get_closest_objs(topology, pu, &pu2, 1);
uint32_t thread_index2 = pu2 ? pu2->os_index : thread_index1;
if (thread_index2 < thread_index1) {
std::swap(thread_index1, thread_index2);
}
OS::setThreadAffinity(thread_index1);
OS::setThreadPriority(3);
constexpr uint32_t N = 1U << 21;
VirtualMemory::init(0, N);
VirtualMemory* memory = new VirtualMemory(N * 8, true, false, false);
// 2 MB cache per core by default
size_t max_scratchpad_size = 1U << 21;
if ((Cpu::info()->L3() >> 22) > Cpu::info()->cores()) {
// At least 1 core can run with 8 MB cache
max_scratchpad_size = 1U << 23;
}
else if ((Cpu::info()->L3() >> 22) >= Cpu::info()->cores()) {
// All cores can run with 4 MB cache
max_scratchpad_size = 1U << 22;
}
LOG_VERBOSE("Running GhostRider benchmark on logical CPUs %u and %u (max scratchpad size %zu MB, huge pages %s)", thread_index1, thread_index2, max_scratchpad_size >> 20, memory->isHugePages() ? "on" : "off");
cryptonight_ctx* ctx[8];
CnCtx::create(ctx, memory->scratchpad(), N, 8);
const CnHash::AlgoVariant* av = Cpu::info()->hasAES() ? av_hw_aes : av_soft_aes;
uint8_t buf[80];
uint8_t hash[32 * 8];
LOG_VERBOSE("%24s | N | Hashrate", "Algorithm");
LOG_VERBOSE("-------------------------|-----|-------------");
for (uint32_t algo = 0; algo < 6; ++algo) {
for (uint64_t step : { 1, 2, 4}) {
const size_t cur_scratchpad_size = cn_sizes[algo] * step;
if (cur_scratchpad_size > max_scratchpad_size) {
continue;
}
auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO);
double start_time = Chrono::highResolutionMSecs();
double min_dt = 1e10;
for (uint32_t iter = 0;; ++iter) {
double t1 = Chrono::highResolutionMSecs();
// Stop after 15 milliseconds, but only if at least 10 iterations were done
if ((iter >= 10) && (t1 - start_time >= 15.0)) {
break;
}
f(buf, sizeof(buf), hash, ctx, 0);
const double dt = Chrono::highResolutionMSecs() - t1;
if (dt < min_dt) {
min_dt = dt;
}
}
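// min_dt is in milliseconds and each call computes "step" hashes,
// so the hashrate is step * 1000 / min_dt hashes per second.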
const double hashrate = step * 1e3 / min_dt;
LOG_VERBOSE("%24s | %" PRIu64 "x1 | %.2f h/s", cn_names[algo], step, hashrate);
if (hashrate > tune8MB[algo].hashrate) {
tune8MB[algo].hashrate = hashrate;
tune8MB[algo].step = static_cast<uint32_t>(step);
tune8MB[algo].threads = 1;
}
if ((cur_scratchpad_size < (1U << 23)) && (hashrate > tuneDefault[algo].hashrate)) {
tuneDefault[algo].hashrate = hashrate;
tuneDefault[algo].step = static_cast<uint32_t>(step);
tuneDefault[algo].threads = 1;
}
}
}
hwloc_bitmap_t helper_set = hwloc_bitmap_alloc();
hwloc_bitmap_set(helper_set, thread_index2);
HelperThread* helper = new HelperThread(helper_set, 3, false);
for (uint32_t algo = 0; algo < 6; ++algo) {
for (uint64_t step : { 1, 2, 4}) {
const size_t cur_scratchpad_size = cn_sizes[algo] * step * 2;
if (cur_scratchpad_size > max_scratchpad_size) {
continue;
}
auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO);
double start_time = Chrono::highResolutionMSecs();
double min_dt = 1e10;
for (uint32_t iter = 0;; ++iter) {
double t1 = Chrono::highResolutionMSecs();
// Stop after 30 milliseconds, but only if at least 10 iterations were done
if ((iter >= 10) && (t1 - start_time >= 30.0)) {
break;
}
helper->launch_task([&f, &buf, &hash, &ctx, &step]() { f(buf, sizeof(buf), hash + step * 32, ctx + step, 0); });
f(buf, sizeof(buf), hash, ctx, 0);
helper->wait();
const double dt = Chrono::highResolutionMSecs() - t1;
if (dt < min_dt) {
min_dt = dt;
}
}
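// Two threads double the per-call work; the extra 0.75% biases the
// tuner toward the 2-thread setup when the results are nearly equal.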
const double hashrate = step * 2e3 / min_dt * 1.0075;
LOG_VERBOSE("%24s | %" PRIu64 "x2 | %.2f h/s", cn_names[algo], step, hashrate);
if (hashrate > tune8MB[algo].hashrate) {
tune8MB[algo].hashrate = hashrate;
tune8MB[algo].step = static_cast<uint32_t>(step);
tune8MB[algo].threads = 2;
}
if ((cur_scratchpad_size < (1U << 23)) && (hashrate > tuneDefault[algo].hashrate)) {
tuneDefault[algo].hashrate = hashrate;
tuneDefault[algo].step = static_cast<uint32_t>(step);
tuneDefault[algo].threads = 2;
}
}
}
delete helper;
CnCtx::release(ctx, 8);
delete memory;
});
t.join();
LOG_VERBOSE("---------------------------------------------");
LOG_VERBOSE("| GhostRider tuning results |");
LOG_VERBOSE("---------------------------------------------");
for (int algo = 0; algo < 6; ++algo) {
LOG_VERBOSE("%24s | %ux%u | %.2f h/s", cn_names[algo], tuneDefault[algo].step, tuneDefault[algo].threads, tuneDefault[algo].hashrate);
if ((tune8MB[algo].step != tuneDefault[algo].step) || (tune8MB[algo].threads != tuneDefault[algo].threads)) {
LOG_VERBOSE("%24s | %ux%u | %.2f h/s", cn_names[algo], tune8MB[algo].step, tune8MB[algo].threads, tune8MB[algo].hashrate);
}
}
#endif
}
template <typename func>
static inline bool findByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
{
for (size_t i = 0; i < obj->arity; i++) {
if (obj->children[i]->type == type) {
if (lambda(obj->children[i])) {
return true;
}
}
else {
if (findByType(obj->children[i], type, lambda)) {
return true;
}
}
}
return false;
}
HelperThread* create_helper_thread(int64_t cpu_index, int priority, const std::vector<int64_t>& affinities)
{
#ifndef XMRIG_ARM
hwloc_bitmap_t helper_cpu_set = hwloc_bitmap_alloc();
hwloc_bitmap_t main_threads_set = hwloc_bitmap_alloc();
for (int64_t i : affinities) {
if (i >= 0) {
hwloc_bitmap_set(main_threads_set, i);
}
}
if (cpu_index >= 0) {
hwloc_topology_t topology = reinterpret_cast<HwlocCpuInfo*>(Cpu::info())->topology();
hwloc_obj_t root = hwloc_get_root_obj(topology);
bool is8MB = false;
findByType(root, HWLOC_OBJ_L3CACHE, [cpu_index, &is8MB](hwloc_obj_t obj) {
if (!hwloc_bitmap_isset(obj->cpuset, cpu_index)) {
return false;
}
uint32_t num_cores = 0;
findByType(obj, HWLOC_OBJ_CORE, [&num_cores](hwloc_obj_t) { ++num_cores; return false; });
if ((obj->attr->cache.size >> 22) > num_cores) {
uint32_t num_8MB_cores = (obj->attr->cache.size >> 22) - num_cores;
is8MB = findByType(obj, HWLOC_OBJ_CORE, [cpu_index, &num_8MB_cores](hwloc_obj_t obj2) {
if (num_8MB_cores > 0) {
--num_8MB_cores;
if (hwloc_bitmap_isset(obj2->cpuset, cpu_index)) {
return true;
}
}
return false;
});
}
return true;
});
for (auto obj_type : { HWLOC_OBJ_CORE, HWLOC_OBJ_L1CACHE, HWLOC_OBJ_L2CACHE, HWLOC_OBJ_L3CACHE }) {
findByType(root, obj_type, [cpu_index, helper_cpu_set, main_threads_set](hwloc_obj_t obj) {
const hwloc_cpuset_t& s = obj->cpuset;
if (hwloc_bitmap_isset(s, cpu_index)) {
hwloc_bitmap_andnot(helper_cpu_set, s, main_threads_set);
if (hwloc_bitmap_weight(helper_cpu_set) > 0) {
return true;
}
}
return false;
});
if (hwloc_bitmap_weight(helper_cpu_set) > 0) {
return new HelperThread(helper_cpu_set, priority, is8MB);
}
}
}
#endif
return nullptr;
}
void destroy_helper_thread(HelperThread* t)
{
delete t;
}
void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper, bool verbose)
{
enum { N = 8 };
uint8_t* ctx_memory[N];
for (size_t i = 0; i < N; ++i) {
ctx_memory[i] = ctx[i]->memory;
}
// PrevBlockHash (GhostRider's seed) is stored in bytes [4; 36)
uint32_t core_indices[15];
select_indices(core_indices, data + 4);
uint32_t cn_indices[6];
select_indices(cn_indices, data + 4);
if (verbose) {
static uint32_t prev_indices[3];
if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
memcpy(prev_indices, cn_indices, sizeof(prev_indices));
for (int i = 0; i < 3; ++i) {
LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
}
}
}
const CnHash::AlgoVariant* av = Cpu::info()->hasAES() ? av_hw_aes : av_soft_aes;
const AlgoTune* tune = (helper && helper->m_is8MB) ? tune8MB : tuneDefault;
uint8_t tmp[64 * N];
if (helper && (tune[cn_indices[0]].threads == 2) && (tune[cn_indices[1]].threads == 2) && (tune[cn_indices[2]].threads == 2)) {
const size_t n = N / 2;
helper->launch_task([n, av, data, size, &ctx_memory, ctx, &cn_indices, &core_indices, &tmp, output, tune]() {
const uint8_t* input = data;
size_t input_size = size;
for (size_t part = 0; part < 3; ++part) {
const AlgoTune& t = tune[cn_indices[part]];
// Allocate scratchpads
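// (Scratchpad packing: each step-sized group restarts at ctx_memory[4]
// and lays its scratchpads out contiguously, moving to the next 2 MB
// region whenever the current one fills; groups therefore reuse the
// same memory, which is safe because each f() call is self-contained.)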
{
uint8_t* p = ctx_memory[4];
for (size_t i = n, k = 4; i < N; ++i) {
if ((i % t.step) == 0) {
k = 4;
p = ctx_memory[4];
}
else if (p - ctx_memory[k] >= (1 << 21)) {
++k;
p = ctx_memory[k];
}
ctx[i]->memory = p;
p += cn_sizes[cn_indices[part]];
}
}
for (size_t i = 0; i < 5; ++i) {
for (size_t j = n; j < N; ++j) {
core_hash[core_indices[part * 5 + i]](input + j * input_size, input_size, tmp + j * 64);
}
input = tmp;
input_size = 64;
}
auto f = CnHash::fn(cn_hash[cn_indices[part]], av[t.step], Assembly::AUTO);
for (size_t j = n; j < N; j += t.step) {
f(tmp + j * 64, 64, output + j * 32, ctx + n, 0);
}
for (size_t j = n; j < N; ++j) {
memcpy(tmp + j * 64, output + j * 32, 32);
memset(tmp + j * 64 + 32, 0, 32);
}
}
});
const uint8_t* input = data;
size_t input_size = size;
for (size_t part = 0; part < 3; ++part) {
const AlgoTune& t = tune[cn_indices[part]];
// Allocate scratchpads
{
uint8_t* p = ctx_memory[0];
for (size_t i = 0, k = 0; i < n; ++i) {
if ((i % t.step) == 0) {
k = 0;
p = ctx_memory[0];
}
else if (p - ctx_memory[k] >= (1 << 21)) {
++k;
p = ctx_memory[k];
}
ctx[i]->memory = p;
p += cn_sizes[cn_indices[part]];
}
}
for (size_t i = 0; i < 5; ++i) {
for (size_t j = 0; j < n; ++j) {
core_hash[core_indices[part * 5 + i]](input + j * input_size, input_size, tmp + j * 64);
}
input = tmp;
input_size = 64;
}
auto f = CnHash::fn(cn_hash[cn_indices[part]], av[t.step], Assembly::AUTO);
for (size_t j = 0; j < n; j += t.step) {
f(tmp + j * 64, 64, output + j * 32, ctx, 0);
}
for (size_t j = 0; j < n; ++j) {
memcpy(tmp + j * 64, output + j * 32, 32);
memset(tmp + j * 64 + 32, 0, 32);
}
}
helper->wait();
}
else {
for (size_t part = 0; part < 3; ++part) {
const AlgoTune& t = tune[cn_indices[part]];
// Allocate scratchpads
{
uint8_t* p = ctx_memory[0];
const size_t n = N / t.threads;
// Thread 1
for (size_t i = 0, k = 0; i < n; ++i) {
if ((i % t.step) == 0) {
k = 0;
p = ctx_memory[0];
}
else if (p - ctx_memory[k] >= (1 << 21)) {
++k;
p = ctx_memory[k];
}
ctx[i]->memory = p;
p += cn_sizes[cn_indices[part]];
}
// Thread 2
for (size_t i = n, k = 4; i < N; ++i) {
if ((i % t.step) == 0) {
k = 4;
p = ctx_memory[4];
}
else if (p - ctx_memory[k] >= (1 << 21)) {
++k;
p = ctx_memory[k];
}
ctx[i]->memory = p;
p += cn_sizes[cn_indices[part]];
}
}
size_t n = N;
if (helper && (t.threads == 2)) {
n = N / 2;
helper->launch_task([data, size, n, &cn_indices, &core_indices, part, &tmp, av, &t, output, ctx]() {
const uint8_t* input = data;
size_t input_size = size;
for (size_t i = 0; i < 5; ++i) {
for (size_t j = n; j < N; ++j) {
core_hash[core_indices[part * 5 + i]](input + j * input_size, input_size, tmp + j * 64);
}
input = tmp;
input_size = 64;
}
auto f = CnHash::fn(cn_hash[cn_indices[part]], av[t.step], Assembly::AUTO);
for (size_t j = n; j < N; j += t.step) {
f(tmp + j * 64, 64, output + j * 32, ctx + n, 0);
}
for (size_t j = n; j < N; ++j) {
memcpy(tmp + j * 64, output + j * 32, 32);
memset(tmp + j * 64 + 32, 0, 32);
}
});
}
for (size_t i = 0; i < 5; ++i) {
for (size_t j = 0; j < n; ++j) {
core_hash[core_indices[part * 5 + i]](data + j * size, size, tmp + j * 64);
}
data = tmp;
size = 64;
}
auto f = CnHash::fn(cn_hash[cn_indices[part]], av[t.step], Assembly::AUTO);
for (size_t j = 0; j < n; j += t.step) {
f(tmp + j * 64, 64, output + j * 32, ctx, 0);
}
for (size_t j = 0; j < n; ++j) {
memcpy(tmp + j * 64, output + j * 32, 32);
memset(tmp + j * 64 + 32, 0, 32);
}
if (helper && (t.threads == 2)) {
helper->wait();
}
}
}
for (size_t i = 0; i < N; ++i) {
ctx[i]->memory = ctx_memory[i];
}
}
#else // XMRIG_FEATURE_HWLOC
void benchmark() {}
HelperThread* create_helper_thread(int64_t, int, const std::vector<int64_t>&) { return nullptr; }
void destroy_helper_thread(HelperThread*) {}
void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread*, bool verbose)
{
constexpr uint32_t N = 8;
uint8_t* ctx_memory[N];
for (size_t i = 0; i < N; ++i) {
ctx_memory[i] = ctx[i]->memory;
}
// PrevBlockHash (GhostRider's seed) is stored in bytes [4; 36)
const uint8_t* seed = data + 4;
uint32_t core_indices[15];
select_indices(core_indices, seed);
uint32_t cn_indices[6];
select_indices(cn_indices, seed);
#ifdef XMRIG_ARM
uint32_t step[6] = { 1, 1, 1, 1, 1, 1 };
#else
uint32_t step[6] = { 4, 4, 1, 2, 4, 4 };
#endif
if (verbose) {
static uint32_t prev_indices[3];
if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
memcpy(prev_indices, cn_indices, sizeof(prev_indices));
for (int i = 0; i < 3; ++i) {
LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
}
}
}
const CnHash::AlgoVariant* av = Cpu::info()->hasAES() ? av_hw_aes : av_soft_aes;
uint8_t tmp[64 * N];
for (size_t part = 0; part < 3; ++part) {
// Allocate scratchpads
{
uint8_t* p = ctx_memory[0];
for (size_t i = 0, k = 0; i < N; ++i) {
if ((i % step[cn_indices[part]]) == 0) {
k = 0;
p = ctx_memory[0];
}
else if (p - ctx_memory[k] >= (1 << 21)) {
++k;
p = ctx_memory[k];
}
ctx[i]->memory = p;
p += cn_sizes[cn_indices[part]];
}
}
for (size_t i = 0; i < 5; ++i) {
for (size_t j = 0; j < N; ++j) {
core_hash[core_indices[part * 5 + i]](data + j * size, size, tmp + j * 64);
}
data = tmp;
size = 64;
}
auto f = CnHash::fn(cn_hash[cn_indices[part]], av[step[cn_indices[part]]], Assembly::AUTO);
for (size_t j = 0; j < N; j += step[cn_indices[part]]) {
f(tmp + j * 64, 64, output + j * 32, ctx, 0);
}
for (size_t j = 0; j < N; ++j) {
memcpy(tmp + j * 64, output + j * 32, 32);
memset(tmp + j * 64 + 32, 0, 32);
}
}
for (size_t i = 0; i < N; ++i) {
ctx[i]->memory = ctx_memory[i];
}
}
#endif // XMRIG_FEATURE_HWLOC
} // namespace ghostrider
} // namespace xmrig

View file

@ -0,0 +1,52 @@
/* XMRig
* Copyright 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XMRIG_GR_HASH_H
#define XMRIG_GR_HASH_H
#include <cstddef>
#include <cstdint>
#include <vector>
struct cryptonight_ctx;
namespace xmrig
{
namespace ghostrider
{
struct HelperThread;
void benchmark();
HelperThread* create_helper_thread(int64_t cpu_index, int priority, const std::vector<int64_t>& affinities);
void destroy_helper_thread(HelperThread* t);
void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper, bool verbose = true);
} // namespace ghostrider
} // namespace xmrig
#endif // XMRIG_GR_HASH_H
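// Illustrative call sequence (a sketch, not part of this header):
//
//   using namespace xmrig::ghostrider;
//   HelperThread* helper = create_helper_thread(cpu_index, priority, affinities);
//   hash_octa(blob, blob_size, hashes, ctx, helper); // 8 x 32-byte hashes
//   destroy_helper_thread(helper);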

View file

@ -0,0 +1,346 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 million terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
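/*
 * Illustrative example of an includer (a sketch; all names here are
 * hypothetical): a big-endian, 64-bit hash function would instantiate
 * this file roughly as
 *
 *   #define RFUN   myhash_round   // compression function
 *   #define HASH   myhash512
 *   #define BE64   1
 *   #include "md_helper.c"
 *
 * which defines sph_myhash512() and the padding/close helpers below.
 */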
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)(void *cc, const void *data, size_t len)
#else
void
SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
#endif
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
size_t current;
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
while (len > 0) {
size_t clen;
#if !SPH_64
sph_u32 clow, clow2;
#endif
clen = SPH_BLEN - current;
if (clen > len)
clen = len;
memcpy(sc->buf + current, data, clen);
data = (const unsigned char *)data + clen;
current += clen;
len -= clen;
if (current == SPH_BLEN) {
RFUN(sc->buf, SPH_VAL);
current = 0;
}
#if SPH_64
sc->count += clen;
#else
clow = sc->count_low;
clow2 = SPH_T32(clow + clen);
sc->count_low = clow2;
if (clow2 < clow)
sc->count_high ++;
#endif
}
}
#ifdef SPH_UPTR
void
SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
unsigned current;
size_t orig_len;
#if !SPH_64
sph_u32 clow, clow2;
#endif
if (len < (2 * SPH_BLEN)) {
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
if (current > 0) {
unsigned t;
t = SPH_BLEN - current;
SPH_XCAT(HASH, _short)(cc, data, t);
data = (const unsigned char *)data + t;
len -= t;
}
#if !SPH_UNALIGNED
if (((SPH_UPTR)data & (SPH_WLEN - 1U)) != 0) {
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
#endif
orig_len = len;
while (len >= SPH_BLEN) {
RFUN(data, SPH_VAL);
len -= SPH_BLEN;
data = (const unsigned char *)data + SPH_BLEN;
}
if (len > 0)
memcpy(sc->buf, data, len);
#if SPH_64
sc->count += (sph_u64)orig_len;
#else
clow = sc->count_low;
clow2 = SPH_T32(clow + orig_len);
sc->count_low = clow2;
if (clow2 < clow)
sc->count_high ++;
/*
* This code handles the improbable situation where "size_t" is
* greater than 32 bits, and yet we do not have a 64-bit type.
*/
orig_len >>= 12;
orig_len >>= 10;
orig_len >>= 10;
sc->count_high += orig_len;
#endif
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
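/*
 * For illustration, in the common BE64 case (neither PLW1 nor PLW4) the
 * final block ends up laid out as
 *
 *   [ data | 0x80 | 0x00 ... 0x00 | count >> 61 | (count << 3) + n ]
 *                                    8 bytes       8 bytes
 *
 * where "count" is the number of message bytes, so the last 16 bytes
 * encode the message length in bits.
 */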
static void
SPH_XCAT(HASH, _addbits_and_close)(void *cc,
unsigned ub, unsigned n, void *dst, unsigned rnum)
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
unsigned current, u;
#if !SPH_64
sph_u32 low, high;
#endif
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
#ifdef PW01
sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n);
#else
{
unsigned z;
z = 0x80 >> n;
sc->buf[current ++] = ((ub & -z) | z) & 0xFF;
}
#endif
if (current > SPH_MAXPAD) {
memset(sc->buf + current, 0, SPH_BLEN - current);
RFUN(sc->buf, SPH_VAL);
memset(sc->buf, 0, SPH_MAXPAD);
} else {
memset(sc->buf + current, 0, SPH_MAXPAD - current);
}
#if defined BE64
#if defined PLW1
sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#elif defined PLW4
memset(sc->buf + SPH_MAXPAD, 0, 2 * SPH_WLEN);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN,
sc->count >> 61);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 3 * SPH_WLEN,
SPH_T64(sc->count << 3) + (sph_u64)n);
#else
sph_enc64be_aligned(sc->buf + SPH_MAXPAD, sc->count >> 61);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN,
SPH_T64(sc->count << 3) + (sph_u64)n);
#endif
#elif defined LE64
#if defined PLW1
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#elif defined PLW4
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
memset(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN, 0, 2 * SPH_WLEN);
#else
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
#endif
#else
#if SPH_64
#ifdef BE32
sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#else
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#endif
#else
low = sc->count_low;
high = SPH_T32((sc->count_high << 3) | (low >> 29));
low = SPH_T32(low << 3) + (sph_u32)n;
#ifdef BE32
sph_enc32be(sc->buf + SPH_MAXPAD, high);
sph_enc32be(sc->buf + SPH_MAXPAD + SPH_WLEN, low);
#else
sph_enc32le(sc->buf + SPH_MAXPAD, low);
sph_enc32le(sc->buf + SPH_MAXPAD + SPH_WLEN, high);
#endif
#endif
#endif
RFUN(sc->buf, SPH_VAL);
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for (u = 0; u < rnum; u ++) {
#if defined BE64
sph_enc64be((unsigned char *)dst + 8 * u, sc->val[u]);
#elif defined LE64
sph_enc64le((unsigned char *)dst + 8 * u, sc->val[u]);
#elif defined BE32
sph_enc32be((unsigned char *)dst + 4 * u, sc->val[u]);
#else
sph_enc32le((unsigned char *)dst + 4 * u, sc->val[u]);
#endif
}
#endif
}
static void
SPH_XCAT(HASH, _close)(void *cc, void *dst, unsigned rnum)
{
SPH_XCAT(HASH, _addbits_and_close)(cc, 0, 0, dst, rnum);
}

File diff suppressed because it is too large

View file

@ -0,0 +1,327 @@
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
/**
* BLAKE interface. BLAKE is a family of functions which differ by their
* output size; this implementation defines BLAKE for output sizes 224,
* 256, 384 and 512 bits. This implementation conforms to the "third
* round" specification.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_blake.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_BLAKE_H__
#define SPH_BLAKE_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for BLAKE-224.
*/
#define SPH_SIZE_blake224 224
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-384.
*/
#define SPH_SIZE_blake384 384
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
/**
* This structure is a context for BLAKE-224 and BLAKE-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BLAKE computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BLAKE
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[8];
sph_u32 S[4];
sph_u32 T0, T1;
#endif
} sph_blake_small_context;
/**
* This structure is a context for BLAKE-224 computations. It is
* identical to the common <code>sph_blake_small_context</code>.
*/
typedef sph_blake_small_context sph_blake224_context;
/**
* This structure is a context for BLAKE-256 computations. It is
* identical to the common <code>sph_blake_small_context</code>.
*/
typedef sph_blake_small_context sph_blake256_context;
#if SPH_64
/**
* This structure is a context for BLAKE-384 and BLAKE-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BLAKE computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BLAKE
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
sph_u64 H[8];
sph_u64 S[4];
sph_u64 T0, T1;
#endif
} sph_blake_big_context;
/**
* This structure is a context for BLAKE-384 computations. It is
* identical to the common <code>sph_blake_big_context</code>.
*/
typedef sph_blake_big_context sph_blake384_context;
/**
* This structure is a context for BLAKE-512 computations. It is
* identical to the common <code>sph_blake_big_context</code>.
*/
typedef sph_blake_big_context sph_blake512_context;
#endif
/**
* Initialize a BLAKE-224 context. This process performs no memory allocation.
*
* @param cc the BLAKE-224 context (pointer to a
* <code>sph_blake224_context</code>)
*/
void sph_blake224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BLAKE-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_blake224(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the BLAKE-224 context
* @param dst the destination buffer
*/
void sph_blake224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BLAKE-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_blake224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a BLAKE-256 context. This process performs no memory allocation.
*
* @param cc the BLAKE-256 context (pointer to a
* <code>sph_blake256_context</code>)
*/
void sph_blake256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BLAKE-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_blake256(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the BLAKE-256 context
* @param dst the destination buffer
*/
void sph_blake256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BLAKE-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_blake256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
/**
* Initialize a BLAKE-384 context. This process performs no memory allocation.
*
* @param cc the BLAKE-384 context (pointer to a
* <code>sph_blake384_context</code>)
*/
void sph_blake384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BLAKE-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_blake384(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the BLAKE-384 context
* @param dst the destination buffer
*/
void sph_blake384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BLAKE-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_blake384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a BLAKE-512 context. This process performs no memory allocation.
*
* @param cc the BLAKE-512 context (pointer to a
* <code>sph_blake512_context</code>)
*/
void sph_blake512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BLAKE-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_blake512(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the BLAKE-512 context
* @param dst the destination buffer
*/
void sph_blake512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BLAKE-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_blake512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
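
For orientation, here is a minimal usage sketch of the streaming API documented above (illustrative only, not part of the library); it assumes this header is available as "sph_blake.h" and linked against the matching implementation. The ub/n convention is the one described in the comments: the n extra bits occupy the top bits of ub.

/* Illustrative sketch only: hash "abc" with BLAKE-512, then hash the
 * same bytes followed by the 3 extra bits "101" (top bits of ub, so
 * ub = 0xA0, n = 3). */
#include <stdio.h>
#include "sph_blake.h"

int main(void)
{
	unsigned char digest[64];   /* BLAKE-512 output is 64 bytes */
	sph_blake512_context cc;
	int i;

	sph_blake512_init(&cc);
	sph_blake512(&cc, "abc", 3);
	sph_blake512_close(&cc, digest);    /* cc is reinitialized */

	sph_blake512(&cc, "abc", 3);
	sph_blake512_addbits_and_close(&cc, 0xA0, 3, digest);

	for (i = 0; i < 64; i ++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}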

View file

@@ -0,0 +1,986 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
/*
* BMW implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "sph_bmw.h"
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#if !defined(__AVX2__)
static const sph_u32 IV224[] = {
SPH_C32(0x00010203), SPH_C32(0x04050607),
SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
SPH_C32(0x10111213), SPH_C32(0x14151617),
SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
SPH_C32(0x20212223), SPH_C32(0x24252627),
SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
SPH_C32(0x30313233), SPH_C32(0x34353637),
SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
};
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
SPH_C32(0x50515253), SPH_C32(0x54555657),
SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
SPH_C32(0x60616263), SPH_C32(0x64656667),
SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
SPH_C32(0x70717273), SPH_C32(0x74757677),
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#endif // !AVX2
#if SPH_64
static const sph_u64 IV384[] = {
SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
};
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define LPAR (
#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
#define M16_16 0, 1, 3, 4, 7, 10, 11
#define M16_17 1, 2, 4, 5, 8, 11, 12
#define M16_18 2, 3, 5, 6, 9, 12, 13
#define M16_19 3, 4, 6, 7, 10, 13, 14
#define M16_20 4, 5, 7, 8, 11, 14, 15
#define M16_21 5, 6, 8, 9, 12, 15, 16
#define M16_22 6, 7, 9, 10, 13, 0, 1
#define M16_23 7, 8, 10, 11, 14, 1, 2
#define M16_24 8, 9, 11, 12, 15, 2, 3
#define M16_25 9, 10, 12, 13, 0, 3, 4
#define M16_26 10, 11, 13, 14, 1, 4, 5
#define M16_27 11, 12, 14, 15, 2, 5, 6
#define M16_28 12, 13, 15, 16, 3, 6, 7
#define M16_29 13, 14, 0, 1, 4, 7, 8
#define M16_30 14, 15, 1, 2, 5, 8, 9
#define M16_31 15, 16, 2, 3, 6, 9, 10
#if !defined(__AVX2__)
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
#endif // !AVX2
#if SPH_64
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37))
#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \
^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \
^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \
^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
#define sb4(x) (((x) >> 1) ^ (x))
#define sb5(x) (((x) >> 2) ^ (x))
#define rb1(x) SPH_ROTL64(x, 5)
#define rb2(x) SPH_ROTL64(x, 11)
#define rb3(x) SPH_ROTL64(x, 27)
#define rb4(x) SPH_ROTL64(x, 32)
#define rb5(x) SPH_ROTL64(x, 37)
#define rb6(x) SPH_ROTL64(x, 43)
#define rb7(x) SPH_ROTL64(x, 53)
#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
#if SPH_SMALL_FOOTPRINT_BMW
static const sph_u64 Kb_tab[] = {
Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
};
#define rol_off(mf, j, off) \
SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
#define add_elt_b(mf, hf, j) \
(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
#define expand1b(qf, mf, hf, i) \
SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+ add_elt_b(mf, hf, (i) - 16))
#define expand2b(qf, mf, hf, i) \
SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+ qf((i) - 14) + rb2(qf((i) - 13)) \
+ qf((i) - 12) + rb3(qf((i) - 11)) \
+ qf((i) - 10) + rb4(qf((i) - 9)) \
+ qf((i) - 8) + rb5(qf((i) - 7)) \
+ qf((i) - 6) + rb6(qf((i) - 5)) \
+ qf((i) - 4) + rb7(qf((i) - 3)) \
+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+ add_elt_b(mf, hf, (i) - 16))
#else
#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
#define expand1b_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1b(qf, mf, hf, i16) \
expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1b_(qf, mf, hf, i16, ix, iy) \
expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2b_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2b(qf, mf, hf, i16) \
expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2b_(qf, mf, hf, i16, ix, iy) \
expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
#endif
#endif
#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
#if !defined(__AVX2__)
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#if SPH_SMALL_FOOTPRINT_BMW
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#else
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#endif
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Qs(j) (qt[j])
#endif // !AVX2
#if SPH_64
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15)
#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15)
#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13)
#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14)
#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15)
#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13)
#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14)
#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15)
#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14)
#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15)
#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9)
#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10)
#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11)
#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12)
#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13)
#if SPH_SMALL_FOOTPRINT_BMW
#define MAKE_Qab do { \
unsigned u; \
sph_u64 Wb[16]; \
Wb[ 0] = Wb0; \
Wb[ 1] = Wb1; \
Wb[ 2] = Wb2; \
Wb[ 3] = Wb3; \
Wb[ 4] = Wb4; \
Wb[ 5] = Wb5; \
Wb[ 6] = Wb6; \
Wb[ 7] = Wb7; \
Wb[ 8] = Wb8; \
Wb[ 9] = Wb9; \
Wb[10] = Wb10; \
Wb[11] = Wb11; \
Wb[12] = Wb12; \
Wb[13] = Wb13; \
Wb[14] = Wb14; \
Wb[15] = Wb15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \
} while (0)
#define MAKE_Qbb do { \
unsigned u; \
for (u = 16; u < 18; u ++) \
qt[u] = expand1b(Qb, M, H, u); \
for (u = 18; u < 32; u ++) \
qt[u] = expand2b(Qb, M, H, u); \
} while (0)
#else
#define MAKE_Qab do { \
qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
} while (0)
#define MAKE_Qbb do { \
qt[16] = expand1b(Qb, M, H, 16); \
qt[17] = expand1b(Qb, M, H, 17); \
qt[18] = expand2b(Qb, M, H, 18); \
qt[19] = expand2b(Qb, M, H, 19); \
qt[20] = expand2b(Qb, M, H, 20); \
qt[21] = expand2b(Qb, M, H, 21); \
qt[22] = expand2b(Qb, M, H, 22); \
qt[23] = expand2b(Qb, M, H, 23); \
qt[24] = expand2b(Qb, M, H, 24); \
qt[25] = expand2b(Qb, M, H, 25); \
qt[26] = expand2b(Qb, M, H, 26); \
qt[27] = expand2b(Qb, M, H, 27); \
qt[28] = expand2b(Qb, M, H, 28); \
qt[29] = expand2b(Qb, M, H, 29); \
qt[30] = expand2b(Qb, M, H, 30); \
qt[31] = expand2b(Qb, M, H, 31); \
} while (0)
#endif
#define MAKE_Qb do { \
MAKE_Qab; \
MAKE_Qbb; \
} while (0)
#define Qb(j) (qt[j])
#endif
#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \
type qt[32], xl, xh; \
mkQ; \
xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \
+ (xl ^ qf(24) ^ qf( 0))); \
dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \
+ (xl ^ qf(25) ^ qf( 1))); \
dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \
+ (xl ^ qf(26) ^ qf( 2))); \
dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \
+ (xl ^ qf(27) ^ qf( 3))); \
dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \
+ (xl ^ qf(28) ^ qf( 4))); \
dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \
+ (xl ^ qf(29) ^ qf( 5))); \
dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \
+ (xl ^ qf(30) ^ qf( 6))); \
dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \
+ (xl ^ qf(31) ^ qf( 7))); \
dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \
+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+ ((xl << 6) ^ qf(17) ^ qf(10))); \
dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+ ((xl << 4) ^ qf(18) ^ qf(11))); \
dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
} while (0)
#if SPH_64
#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
#endif
#if !defined(__AVX2__)
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
{
#if SPH_LITTLE_FAST
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#else
sph_u32 mv[16];
mv[ 0] = sph_dec32le_aligned(data + 0);
mv[ 1] = sph_dec32le_aligned(data + 4);
mv[ 2] = sph_dec32le_aligned(data + 8);
mv[ 3] = sph_dec32le_aligned(data + 12);
mv[ 4] = sph_dec32le_aligned(data + 16);
mv[ 5] = sph_dec32le_aligned(data + 20);
mv[ 6] = sph_dec32le_aligned(data + 24);
mv[ 7] = sph_dec32le_aligned(data + 28);
mv[ 8] = sph_dec32le_aligned(data + 32);
mv[ 9] = sph_dec32le_aligned(data + 36);
mv[10] = sph_dec32le_aligned(data + 40);
mv[11] = sph_dec32le_aligned(data + 44);
mv[12] = sph_dec32le_aligned(data + 48);
mv[13] = sph_dec32le_aligned(data + 52);
mv[14] = sph_dec32le_aligned(data + 56);
mv[15] = sph_dec32le_aligned(data + 60);
#define M(x) (mv[x])
#endif
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
};
static void
bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
}
static void
bmw32(sph_bmw_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
}
static void
bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
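/* z has a single 1 bit at position (7 - n); (ub & -z) keeps the n
   extra bits above it and | z appends the padding '1' bit (the -z on
   an unsigned operand is why warning 4146 is disabled above). */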
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
}
#endif // !AVX2
#if SPH_64
static void
compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
{
#if SPH_LITTLE_FAST
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#else
sph_u64 mv[16];
mv[ 0] = sph_dec64le_aligned(data + 0);
mv[ 1] = sph_dec64le_aligned(data + 8);
mv[ 2] = sph_dec64le_aligned(data + 16);
mv[ 3] = sph_dec64le_aligned(data + 24);
mv[ 4] = sph_dec64le_aligned(data + 32);
mv[ 5] = sph_dec64le_aligned(data + 40);
mv[ 6] = sph_dec64le_aligned(data + 48);
mv[ 7] = sph_dec64le_aligned(data + 56);
mv[ 8] = sph_dec64le_aligned(data + 64);
mv[ 9] = sph_dec64le_aligned(data + 72);
mv[10] = sph_dec64le_aligned(data + 80);
mv[11] = sph_dec64le_aligned(data + 88);
mv[12] = sph_dec64le_aligned(data + 96);
mv[13] = sph_dec64le_aligned(data + 104);
mv[14] = sph_dec64le_aligned(data + 112);
mv[15] = sph_dec64le_aligned(data + 120);
#define M(x) (mv[x])
#endif
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDb;
#undef M
#undef H
#undef dH
}
static const sph_u64 final_b[16] = {
SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
};
static void
bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw64(sph_bmw_big_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u64 htmp[16];
sph_u64 *h1, *h2;
sc->bit_count += (sph_u64)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u64 *ht;
compress_big(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
}
static void
bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w64)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u64 h1[16], h2[16], *h;
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_big(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
compress_big(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc64le_aligned(buf + 8 * u, h2[u]);
compress_big(buf, final_b, h1);
out = dst;
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
sph_enc64le(out + 8 * u, h1[v]);
}
#endif
#if !defined(__AVX2__)
/* see sph_bmw.h */
void
sph_bmw224_init(void *cc)
{
bmw32_init(cc, IV224);
}
/* see sph_bmw.h */
void
sph_bmw224(void *cc, const void *data, size_t len)
{
bmw32(cc, data, len);
}
/* see sph_bmw.h */
void
sph_bmw224_close(void *cc, void *dst)
{
sph_bmw224_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_bmw.h */
void
sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw32_close(cc, ub, n, dst, 7);
// sph_bmw224_init(cc);
}
/* see sph_bmw.h */
void
sph_bmw256_init(void *cc)
{
bmw32_init(cc, IV256);
}
/* see sph_bmw.h */
void
sph_bmw256(void *cc, const void *data, size_t len)
{
bmw32(cc, data, len);
}
/* see sph_bmw.h */
void
sph_bmw256_close(void *cc, void *dst)
{
sph_bmw256_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_bmw.h */
void
sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw32_close(cc, ub, n, dst, 8);
// sph_bmw256_init(cc);
}
#endif // !AVX2
#if SPH_64
/* see sph_bmw.h */
void
sph_bmw384_init(void *cc)
{
bmw64_init(cc, IV384);
}
/* see sph_bmw.h */
void
sph_bmw384(void *cc, const void *data, size_t len)
{
bmw64(cc, data, len);
}
/* see sph_bmw.h */
void
sph_bmw384_close(void *cc, void *dst)
{
sph_bmw384_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_bmw.h */
void
sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw64_close(cc, ub, n, dst, 6);
// sph_bmw384_init(cc);
}
/* see sph_bmw.h */
void
sph_bmw512_init(void *cc)
{
bmw64_init(cc, IV512);
}
/* see sph_bmw.h */
void
sph_bmw512(void *cc, const void *data, size_t len)
{
bmw64(cc, data, len);
}
/* see sph_bmw.h */
void
sph_bmw512_close(void *cc, void *dst)
{
sph_bmw512_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_bmw.h */
void
sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw64_close(cc, ub, n, dst, 8);
// sph_bmw512_init(cc);
}
#endif
#ifdef __cplusplus
}
#endif

View file

@@ -0,0 +1,337 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
/**
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
* functions which differ by their output size; this implementation
* defines BMW for output sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_bmw.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_BMW_H__
#define SPH_BMW_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if !defined(__AVX2__)
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
} sph_bmw_small_context;
/**
* This structure is a context for BMW-224 computations. It is
* identical to the common <code>sph_bmw_small_context</code>.
*/
typedef sph_bmw_small_context sph_bmw224_context;
/**
* This structure is a context for BMW-256 computations. It is
* identical to the common <code>sph_bmw_small_context</code>.
*/
typedef sph_bmw_small_context sph_bmw256_context;
#endif // !AVX2
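
The comment above notes that a running computation can be cloned by copying the context; a small illustrative sketch of that pattern (using the BMW-256 entry points declared later in this header):

/* Illustrative only: hash a shared prefix once, then fork the running
 * state with memcpy() to finish two different messages. */
#include <string.h>
#include "sph_bmw.h"

static void bmw256_fork_example(unsigned char d1[32], unsigned char d2[32])
{
	sph_bmw256_context base, clone;

	sph_bmw256_init(&base);
	sph_bmw256(&base, "common-prefix", 13);

	memcpy(&clone, &base, sizeof base);  /* clone the running state */
	sph_bmw256(&base, "-a", 2);
	sph_bmw256(&clone, "-b", 2);
	sph_bmw256_close(&base, d1);
	sph_bmw256_close(&clone, d2);
}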
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
sph_u64 H[16];
sph_u64 bit_count;
#endif
} sph_bmw_big_context;
/**
* This structure is a context for BMW-384 computations. It is
* identical to the common <code>sph_bmw_big_context</code>.
*/
typedef sph_bmw_big_context sph_bmw384_context;
/**
* This structure is a context for BMW-512 computations. It is
* identical to the common <code>sph_bmw_big_context</code>.
*/
typedef sph_bmw_big_context sph_bmw512_context;
#endif
#if !defined(__AVX2__)
/**
* Initialize a BMW-224 context. This process performs no memory allocation.
*
* @param cc the BMW-224 context (pointer to a
* <code>sph_bmw224_context</code>)
*/
void sph_bmw224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BMW-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_bmw224(void *cc, const void *data, size_t len);
/**
* Terminate the current BMW-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the BMW-224 context
* @param dst the destination buffer
*/
void sph_bmw224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BMW-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_bmw224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a BMW-256 context. This process performs no memory allocation.
*
* @param cc the BMW-256 context (pointer to a
* <code>sph_bmw256_context</code>)
*/
void sph_bmw256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BMW-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_bmw256(void *cc, const void *data, size_t len);
/**
* Terminate the current BMW-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the BMW-256 context
* @param dst the destination buffer
*/
void sph_bmw256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BMW-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_bmw256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif // !AVX2
#if SPH_64
/**
* Initialize a BMW-384 context. This process performs no memory allocation.
*
* @param cc the BMW-384 context (pointer to a
* <code>sph_bmw384_context</code>)
*/
void sph_bmw384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BMW-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_bmw384(void *cc, const void *data, size_t len);
/**
* Terminate the current BMW-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the BMW-384 context
* @param dst the destination buffer
*/
void sph_bmw384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BMW-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_bmw384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a BMW-512 context. This process performs no memory allocation.
*
* @param cc the BMW-512 context (pointer to a
* <code>sph_bmw512_context</code>)
*/
void sph_bmw512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the BMW-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_bmw512(void *cc, const void *data, size_t len);
/**
* Terminate the current BMW-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the BMW-512 context
* @param dst the destination buffer
*/
void sph_bmw512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the BMW-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_bmw512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif

View file

@@ -0,0 +1,723 @@
/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
/*
* CubeHash implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "sph_cubehash.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH
#define SPH_SMALL_FOOTPRINT_CUBEHASH 1
#endif
/*
* Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit
* mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302).
* It appears that the optimal settings are:
* -- full unroll, no state copy on the "big" systems (x86, PowerPC)
* -- unroll to 4 or 8, state copy on the "small" system (MIPS)
*/
#if SPH_SMALL_FOOTPRINT_CUBEHASH
#if !defined SPH_CUBEHASH_UNROLL
#define SPH_CUBEHASH_UNROLL 4
#endif
#if !defined SPH_CUBEHASH_NOCOPY
#define SPH_CUBEHASH_NOCOPY 1
#endif
#else
#if !defined SPH_CUBEHASH_UNROLL
#define SPH_CUBEHASH_UNROLL 0
#endif
#if !defined SPH_CUBEHASH_NOCOPY
#define SPH_CUBEHASH_NOCOPY 0
#endif
#endif
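Because each of these macros is wrapped in !defined guards, the defaults can be overridden externally; a hedged sketch of one way to do that (the wrapper file name is hypothetical):

/* cubehash_small.c (hypothetical wrapper TU): force the 4-round unroll
 * with in-state computation before pulling in the implementation. */
#define SPH_CUBEHASH_UNROLL 4
#define SPH_CUBEHASH_NOCOPY 1
#include "cubehash.c"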
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
static const sph_u32 IV224[] = {
SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22),
SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24),
SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3),
SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774),
SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66),
SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0),
SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314),
SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE),
SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF),
SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE),
SPH_C32(0xFD20151C), SPH_C32(0x00CB573E)
};
static const sph_u32 IV256[] = {
SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71),
SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63),
SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696),
SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C),
SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00),
SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5),
SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549),
SPH_C32(0x5FA25603), SPH_C32(0x65C892FD), SPH_C32(0x93CB6285),
SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD),
SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6),
SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A)
};
static const sph_u32 IV384[] = {
SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453),
SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88),
SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922),
SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052),
SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E),
SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D),
SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70),
SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4),
SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC),
SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01),
SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E)
};
static const sph_u32 IV512[] = {
SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
};
#define T32 SPH_T32
#define ROTL32 SPH_ROTL32
#if SPH_CUBEHASH_NOCOPY
#define DECL_STATE
#define READ_STATE(cc)
#define WRITE_STATE(cc)
#define x0 ((sc)->state[ 0])
#define x1 ((sc)->state[ 1])
#define x2 ((sc)->state[ 2])
#define x3 ((sc)->state[ 3])
#define x4 ((sc)->state[ 4])
#define x5 ((sc)->state[ 5])
#define x6 ((sc)->state[ 6])
#define x7 ((sc)->state[ 7])
#define x8 ((sc)->state[ 8])
#define x9 ((sc)->state[ 9])
#define xa ((sc)->state[10])
#define xb ((sc)->state[11])
#define xc ((sc)->state[12])
#define xd ((sc)->state[13])
#define xe ((sc)->state[14])
#define xf ((sc)->state[15])
#define xg ((sc)->state[16])
#define xh ((sc)->state[17])
#define xi ((sc)->state[18])
#define xj ((sc)->state[19])
#define xk ((sc)->state[20])
#define xl ((sc)->state[21])
#define xm ((sc)->state[22])
#define xn ((sc)->state[23])
#define xo ((sc)->state[24])
#define xp ((sc)->state[25])
#define xq ((sc)->state[26])
#define xr ((sc)->state[27])
#define xs ((sc)->state[28])
#define xt ((sc)->state[29])
#define xu ((sc)->state[30])
#define xv ((sc)->state[31])
#else
#define DECL_STATE \
sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \
sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \
sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \
sph_u32 xo, xp, xq, xr, xs, xt, xu, xv;
#define READ_STATE(cc) do { \
x0 = (cc)->state[ 0]; \
x1 = (cc)->state[ 1]; \
x2 = (cc)->state[ 2]; \
x3 = (cc)->state[ 3]; \
x4 = (cc)->state[ 4]; \
x5 = (cc)->state[ 5]; \
x6 = (cc)->state[ 6]; \
x7 = (cc)->state[ 7]; \
x8 = (cc)->state[ 8]; \
x9 = (cc)->state[ 9]; \
xa = (cc)->state[10]; \
xb = (cc)->state[11]; \
xc = (cc)->state[12]; \
xd = (cc)->state[13]; \
xe = (cc)->state[14]; \
xf = (cc)->state[15]; \
xg = (cc)->state[16]; \
xh = (cc)->state[17]; \
xi = (cc)->state[18]; \
xj = (cc)->state[19]; \
xk = (cc)->state[20]; \
xl = (cc)->state[21]; \
xm = (cc)->state[22]; \
xn = (cc)->state[23]; \
xo = (cc)->state[24]; \
xp = (cc)->state[25]; \
xq = (cc)->state[26]; \
xr = (cc)->state[27]; \
xs = (cc)->state[28]; \
xt = (cc)->state[29]; \
xu = (cc)->state[30]; \
xv = (cc)->state[31]; \
} while (0)
#define WRITE_STATE(cc) do { \
(cc)->state[ 0] = x0; \
(cc)->state[ 1] = x1; \
(cc)->state[ 2] = x2; \
(cc)->state[ 3] = x3; \
(cc)->state[ 4] = x4; \
(cc)->state[ 5] = x5; \
(cc)->state[ 6] = x6; \
(cc)->state[ 7] = x7; \
(cc)->state[ 8] = x8; \
(cc)->state[ 9] = x9; \
(cc)->state[10] = xa; \
(cc)->state[11] = xb; \
(cc)->state[12] = xc; \
(cc)->state[13] = xd; \
(cc)->state[14] = xe; \
(cc)->state[15] = xf; \
(cc)->state[16] = xg; \
(cc)->state[17] = xh; \
(cc)->state[18] = xi; \
(cc)->state[19] = xj; \
(cc)->state[20] = xk; \
(cc)->state[21] = xl; \
(cc)->state[22] = xm; \
(cc)->state[23] = xn; \
(cc)->state[24] = xo; \
(cc)->state[25] = xp; \
(cc)->state[26] = xq; \
(cc)->state[27] = xr; \
(cc)->state[28] = xs; \
(cc)->state[29] = xt; \
(cc)->state[30] = xu; \
(cc)->state[31] = xv; \
} while (0)
#endif
#define INPUT_BLOCK do { \
x0 ^= sph_dec32le_aligned(buf + 0); \
x1 ^= sph_dec32le_aligned(buf + 4); \
x2 ^= sph_dec32le_aligned(buf + 8); \
x3 ^= sph_dec32le_aligned(buf + 12); \
x4 ^= sph_dec32le_aligned(buf + 16); \
x5 ^= sph_dec32le_aligned(buf + 20); \
x6 ^= sph_dec32le_aligned(buf + 24); \
x7 ^= sph_dec32le_aligned(buf + 28); \
} while (0)
#define ROUND_EVEN do { \
xg = T32(x0 + xg); \
x0 = ROTL32(x0, 7); \
xh = T32(x1 + xh); \
x1 = ROTL32(x1, 7); \
xi = T32(x2 + xi); \
x2 = ROTL32(x2, 7); \
xj = T32(x3 + xj); \
x3 = ROTL32(x3, 7); \
xk = T32(x4 + xk); \
x4 = ROTL32(x4, 7); \
xl = T32(x5 + xl); \
x5 = ROTL32(x5, 7); \
xm = T32(x6 + xm); \
x6 = ROTL32(x6, 7); \
xn = T32(x7 + xn); \
x7 = ROTL32(x7, 7); \
xo = T32(x8 + xo); \
x8 = ROTL32(x8, 7); \
xp = T32(x9 + xp); \
x9 = ROTL32(x9, 7); \
xq = T32(xa + xq); \
xa = ROTL32(xa, 7); \
xr = T32(xb + xr); \
xb = ROTL32(xb, 7); \
xs = T32(xc + xs); \
xc = ROTL32(xc, 7); \
xt = T32(xd + xt); \
xd = ROTL32(xd, 7); \
xu = T32(xe + xu); \
xe = ROTL32(xe, 7); \
xv = T32(xf + xv); \
xf = ROTL32(xf, 7); \
x8 ^= xg; \
x9 ^= xh; \
xa ^= xi; \
xb ^= xj; \
xc ^= xk; \
xd ^= xl; \
xe ^= xm; \
xf ^= xn; \
x0 ^= xo; \
x1 ^= xp; \
x2 ^= xq; \
x3 ^= xr; \
x4 ^= xs; \
x5 ^= xt; \
x6 ^= xu; \
x7 ^= xv; \
xi = T32(x8 + xi); \
x8 = ROTL32(x8, 11); \
xj = T32(x9 + xj); \
x9 = ROTL32(x9, 11); \
xg = T32(xa + xg); \
xa = ROTL32(xa, 11); \
xh = T32(xb + xh); \
xb = ROTL32(xb, 11); \
xm = T32(xc + xm); \
xc = ROTL32(xc, 11); \
xn = T32(xd + xn); \
xd = ROTL32(xd, 11); \
xk = T32(xe + xk); \
xe = ROTL32(xe, 11); \
xl = T32(xf + xl); \
xf = ROTL32(xf, 11); \
xq = T32(x0 + xq); \
x0 = ROTL32(x0, 11); \
xr = T32(x1 + xr); \
x1 = ROTL32(x1, 11); \
xo = T32(x2 + xo); \
x2 = ROTL32(x2, 11); \
xp = T32(x3 + xp); \
x3 = ROTL32(x3, 11); \
xu = T32(x4 + xu); \
x4 = ROTL32(x4, 11); \
xv = T32(x5 + xv); \
x5 = ROTL32(x5, 11); \
xs = T32(x6 + xs); \
x6 = ROTL32(x6, 11); \
xt = T32(x7 + xt); \
x7 = ROTL32(x7, 11); \
xc ^= xi; \
xd ^= xj; \
xe ^= xg; \
xf ^= xh; \
x8 ^= xm; \
x9 ^= xn; \
xa ^= xk; \
xb ^= xl; \
x4 ^= xq; \
x5 ^= xr; \
x6 ^= xo; \
x7 ^= xp; \
x0 ^= xu; \
x1 ^= xv; \
x2 ^= xs; \
x3 ^= xt; \
} while (0)
#define ROUND_ODD do { \
xj = T32(xc + xj); \
xc = ROTL32(xc, 7); \
xi = T32(xd + xi); \
xd = ROTL32(xd, 7); \
xh = T32(xe + xh); \
xe = ROTL32(xe, 7); \
xg = T32(xf + xg); \
xf = ROTL32(xf, 7); \
xn = T32(x8 + xn); \
x8 = ROTL32(x8, 7); \
xm = T32(x9 + xm); \
x9 = ROTL32(x9, 7); \
xl = T32(xa + xl); \
xa = ROTL32(xa, 7); \
xk = T32(xb + xk); \
xb = ROTL32(xb, 7); \
xr = T32(x4 + xr); \
x4 = ROTL32(x4, 7); \
xq = T32(x5 + xq); \
x5 = ROTL32(x5, 7); \
xp = T32(x6 + xp); \
x6 = ROTL32(x6, 7); \
xo = T32(x7 + xo); \
x7 = ROTL32(x7, 7); \
xv = T32(x0 + xv); \
x0 = ROTL32(x0, 7); \
xu = T32(x1 + xu); \
x1 = ROTL32(x1, 7); \
xt = T32(x2 + xt); \
x2 = ROTL32(x2, 7); \
xs = T32(x3 + xs); \
x3 = ROTL32(x3, 7); \
x4 ^= xj; \
x5 ^= xi; \
x6 ^= xh; \
x7 ^= xg; \
x0 ^= xn; \
x1 ^= xm; \
x2 ^= xl; \
x3 ^= xk; \
xc ^= xr; \
xd ^= xq; \
xe ^= xp; \
xf ^= xo; \
x8 ^= xv; \
x9 ^= xu; \
xa ^= xt; \
xb ^= xs; \
xh = T32(x4 + xh); \
x4 = ROTL32(x4, 11); \
xg = T32(x5 + xg); \
x5 = ROTL32(x5, 11); \
xj = T32(x6 + xj); \
x6 = ROTL32(x6, 11); \
xi = T32(x7 + xi); \
x7 = ROTL32(x7, 11); \
xl = T32(x0 + xl); \
x0 = ROTL32(x0, 11); \
xk = T32(x1 + xk); \
x1 = ROTL32(x1, 11); \
xn = T32(x2 + xn); \
x2 = ROTL32(x2, 11); \
xm = T32(x3 + xm); \
x3 = ROTL32(x3, 11); \
xp = T32(xc + xp); \
xc = ROTL32(xc, 11); \
xo = T32(xd + xo); \
xd = ROTL32(xd, 11); \
xr = T32(xe + xr); \
xe = ROTL32(xe, 11); \
xq = T32(xf + xq); \
xf = ROTL32(xf, 11); \
xt = T32(x8 + xt); \
x8 = ROTL32(x8, 11); \
xs = T32(x9 + xs); \
x9 = ROTL32(x9, 11); \
xv = T32(xa + xv); \
xa = ROTL32(xa, 11); \
xu = T32(xb + xu); \
xb = ROTL32(xb, 11); \
x0 ^= xh; \
x1 ^= xg; \
x2 ^= xj; \
x3 ^= xi; \
x4 ^= xl; \
x5 ^= xk; \
x6 ^= xn; \
x7 ^= xm; \
x8 ^= xp; \
x9 ^= xo; \
xa ^= xr; \
xb ^= xq; \
xc ^= xt; \
xd ^= xs; \
xe ^= xv; \
xf ^= xu; \
} while (0)
/*
* There is no need to unroll all 16 rounds. The word-swapping permutation
* is an involution, so we need to unroll an even number of rounds. On
* "big" systems, unrolling 4 rounds yields about 97% of the speed
* achieved with full unrolling; and it keeps the code more compact
* for small architectures.
*/
#if SPH_CUBEHASH_UNROLL == 2
#define SIXTEEN_ROUNDS do { \
int j; \
for (j = 0; j < 8; j ++) { \
ROUND_EVEN; \
ROUND_ODD; \
} \
} while (0)
#elif SPH_CUBEHASH_UNROLL == 4
#define SIXTEEN_ROUNDS do { \
int j; \
for (j = 0; j < 4; j ++) { \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
} \
} while (0)
#elif SPH_CUBEHASH_UNROLL == 8
#define SIXTEEN_ROUNDS do { \
int j; \
for (j = 0; j < 2; j ++) { \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
} \
} while (0)
#else
#define SIXTEEN_ROUNDS do { \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
ROUND_EVEN; \
ROUND_ODD; \
} while (0)
#endif
static void
cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv)
{
memcpy(sc->state, iv, sizeof sc->state);
sc->ptr = 0;
}
static void
cubehash_core(sph_cubehash_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if (ptr == sizeof sc->buf) {
INPUT_BLOCK;
SIXTEEN_ROUNDS;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
static void
cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr;
unsigned z;
int i;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
READ_STATE(sc);
INPUT_BLOCK;
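/* The first of these eleven passes absorbs the final padded block; the
   xor of 1 into the last state word then begins the 10 * 16 = 160
   finalization rounds. */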
for (i = 0; i < 11; i ++) {
SIXTEEN_ROUNDS;
if (i == 0)
xv ^= SPH_C32(1);
}
WRITE_STATE(sc);
out = dst;
for (z = 0; z < out_size_w32; z ++)
sph_enc32le(out + (z << 2), sc->state[z]);
}
/* see sph_cubehash.h */
void
sph_cubehash224_init(void *cc)
{
cubehash_init(cc, IV224);
}
/* see sph_cubehash.h */
void
sph_cubehash224(void *cc, const void *data, size_t len)
{
cubehash_core(cc, data, len);
}
/* see sph_cubehash.h */
void
sph_cubehash224_close(void *cc, void *dst)
{
sph_cubehash224_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_cubehash.h */
void
sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
cubehash_close(cc, ub, n, dst, 7);
sph_cubehash224_init(cc);
}
/* see sph_cubehash.h */
void
sph_cubehash256_init(void *cc)
{
cubehash_init(cc, IV256);
}
/* see sph_cubehash.h */
void
sph_cubehash256(void *cc, const void *data, size_t len)
{
cubehash_core(cc, data, len);
}
/* see sph_cubehash.h */
void
sph_cubehash256_close(void *cc, void *dst)
{
sph_cubehash256_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_cubehash.h */
void
sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
cubehash_close(cc, ub, n, dst, 8);
sph_cubehash256_init(cc);
}
/* see sph_cubehash.h */
void
sph_cubehash384_init(void *cc)
{
cubehash_init(cc, IV384);
}
/* see sph_cubehash.h */
void
sph_cubehash384(void *cc, const void *data, size_t len)
{
cubehash_core(cc, data, len);
}
/* see sph_cubehash.h */
void
sph_cubehash384_close(void *cc, void *dst)
{
sph_cubehash384_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_cubehash.h */
void
sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
cubehash_close(cc, ub, n, dst, 12);
sph_cubehash384_init(cc);
}
/* see sph_cubehash.h */
void
sph_cubehash512_init(void *cc)
{
cubehash_init(cc, IV512);
}
/* see sph_cubehash.h */
void
sph_cubehash512(void *cc, const void *data, size_t len)
{
cubehash_core(cc, data, len);
}
/* see sph_cubehash.h */
void
sph_cubehash512_close(void *cc, void *dst)
{
sph_cubehash512_addbits_and_close(cc, 0, 0, dst);
}
/* see sph_cubehash.h */
void
sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
cubehash_close(cc, ub, n, dst, 16);
sph_cubehash512_init(cc);
}
#ifdef __cplusplus
}
#endif
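
A minimal one-shot sketch for the functions above (illustrative only; assumes the header shown next is on the include path):

/* Illustrative only: CubeHash-256 digest of "abc". */
#include <stdio.h>
#include "sph_cubehash.h"

int main(void)
{
	unsigned char digest[32];   /* SPH_SIZE_cubehash256 / 8 */
	sph_cubehash256_context cc;
	int i;

	sph_cubehash256_init(&cc);
	sph_cubehash256(&cc, "abc", 3);
	sph_cubehash256_close(&cc, digest);  /* cc is reinitialized */

	for (i = 0; i < 32; i ++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}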

View file

@@ -0,0 +1,292 @@
/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */
/**
* CubeHash interface. CubeHash is a family of functions which differ by
* their output size; this implementation defines CubeHash for output
* sizes 224, 256, 384 and 512 bits, with the "standard parameters"
* (CubeHash16/32 with the CubeHash specification notations).
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_cubehash.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_CUBEHASH_H__
#define SPH_CUBEHASH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for CubeHash-224.
*/
#define SPH_SIZE_cubehash224 224
/**
* Output size (in bits) for CubeHash-256.
*/
#define SPH_SIZE_cubehash256 256
/**
* Output size (in bits) for CubeHash-384.
*/
#define SPH_SIZE_cubehash384 384
/**
* Output size (in bits) for CubeHash-512.
*/
#define SPH_SIZE_cubehash512 512
/**
* This structure is a context for CubeHash computations: it contains the
* intermediate values and some data from the last entered block. Once
* a CubeHash computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running CubeHash computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[32]; /* first field, for alignment */
size_t ptr;
sph_u32 state[32];
#endif
} sph_cubehash_context;
/**
* Type for a CubeHash-224 context (identical to the common context).
*/
typedef sph_cubehash_context sph_cubehash224_context;
/**
* Type for a CubeHash-256 context (identical to the common context).
*/
typedef sph_cubehash_context sph_cubehash256_context;
/**
* Type for a CubeHash-384 context (identical to the common context).
*/
typedef sph_cubehash_context sph_cubehash384_context;
/**
* Type for a CubeHash-512 context (identical to the common context).
*/
typedef sph_cubehash_context sph_cubehash512_context;
/**
* Initialize a CubeHash-224 context. This process performs no memory
* allocation.
*
* @param cc the CubeHash-224 context (pointer to a
* <code>sph_cubehash224_context</code>)
*/
void sph_cubehash224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the CubeHash-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_cubehash224(void *cc, const void *data, size_t len);
/**
* Terminate the current CubeHash-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the CubeHash-224 context
* @param dst the destination buffer
*/
void sph_cubehash224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the CubeHash-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_cubehash224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a CubeHash-256 context. This process performs no memory
* allocation.
*
* @param cc the CubeHash-256 context (pointer to a
* <code>sph_cubehash256_context</code>)
*/
void sph_cubehash256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the CubeHash-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_cubehash256(void *cc, const void *data, size_t len);
/**
* Terminate the current CubeHash-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the CubeHash-256 context
* @param dst the destination buffer
*/
void sph_cubehash256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the CubeHash-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_cubehash256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a CubeHash-384 context. This process performs no memory
* allocation.
*
* @param cc the CubeHash-384 context (pointer to a
* <code>sph_cubehash384_context</code>)
*/
void sph_cubehash384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the CubeHash-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_cubehash384(void *cc, const void *data, size_t len);
/**
* Terminate the current CubeHash-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the CubeHash-384 context
* @param dst the destination buffer
*/
void sph_cubehash384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the CubeHash-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_cubehash384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a CubeHash-512 context. This process performs no memory
* allocation.
*
* @param cc the CubeHash-512 context (pointer to a
* <code>sph_cubehash512_context</code>)
*/
void sph_cubehash512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the CubeHash-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_cubehash512(void *cc, const void *data, size_t len);
/**
* Terminate the current CubeHash-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the CubeHash-512 context
* @param dst the destination buffer
*/
void sph_cubehash512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the CubeHash-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_cubehash512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif
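The four CubeHash sizes above share one calling pattern: initialize a context, absorb data in chunks of any size, then close to emit the digest. A minimal usage sketch for CubeHash-256 follows; the message and chunk boundaries are illustrative, not part of the header.

#include <stdio.h>
#include "sph_cubehash.h"

/* Hash "abc" in two chunks with CubeHash-256; the chunk boundaries are
 * arbitrary and the digest equals that of a single one-shot call. */
int main(void)
{
    static const unsigned char msg[3] = { 'a', 'b', 'c' };
    unsigned char out[SPH_SIZE_cubehash256 / 8];
    sph_cubehash256_context cc;
    size_t i;

    sph_cubehash256_init(&cc);
    sph_cubehash256(&cc, msg, 1);      /* first byte */
    sph_cubehash256(&cc, msg + 1, 2);  /* remaining two bytes */
    sph_cubehash256_close(&cc, out);   /* writes 32 bytes, reinits cc */

    for (i = 0; i < sizeof out; i++)
        printf("%02x", out[i]);
    putchar('\n');
    return 0;
}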

File diff suppressed because it is too large

View file

@ -0,0 +1,319 @@
/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
/**
* ECHO interface. ECHO is a family of functions which differ by
* their output size; this implementation defines ECHO for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_echo.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for ECHO-224.
*/
#define SPH_SIZE_echo224 224
/**
* Output size (in bits) for ECHO-256.
*/
#define SPH_SIZE_echo256 256
/**
* Output size (in bits) for ECHO-384.
*/
#define SPH_SIZE_echo384 384
/**
* Output size (in bits) for ECHO-512.
*/
#define SPH_SIZE_echo512 512
/**
* This structure is a context for ECHO computations: it contains the
* intermediate values and some data from the last entered block. Once
* an ECHO computation has been performed, the context can be reused for
* another computation. This specific structure is used for ECHO-224
* and ECHO-256.
*
* The contents of this structure are private. A running ECHO computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[192]; /* first field, for alignment */
size_t ptr;
union {
sph_u32 Vs[4][4];
#if SPH_64
sph_u64 Vb[4][2];
#endif
} u;
sph_u32 C0, C1, C2, C3;
#endif
} sph_echo_small_context;
/**
* This structure is a context for ECHO computations: it contains the
* intermediate values and some data from the last entered block. Once
* an ECHO computation has been performed, the context can be reused for
* another computation. This specific structure is used for ECHO-384
* and ECHO-512.
*
* The contents of this structure are private. A running ECHO computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
union {
sph_u32 Vs[8][4];
#if SPH_64
sph_u64 Vb[8][2];
#endif
} u;
sph_u32 C0, C1, C2, C3;
#endif
} sph_echo_big_context;
/**
 * Type for an ECHO-224 context (identical to the common "small" context).
*/
typedef sph_echo_small_context sph_echo224_context;
/**
 * Type for an ECHO-256 context (identical to the common "small" context).
*/
typedef sph_echo_small_context sph_echo256_context;
/**
 * Type for an ECHO-384 context (identical to the common "big" context).
*/
typedef sph_echo_big_context sph_echo384_context;
/**
 * Type for an ECHO-512 context (identical to the common "big" context).
*/
typedef sph_echo_big_context sph_echo512_context;
/**
* Initialize an ECHO-224 context. This process performs no memory allocation.
*
* @param cc the ECHO-224 context (pointer to a
* <code>sph_echo224_context</code>)
*/
void sph_echo224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo224(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-224 context
* @param dst the destination buffer
*/
void sph_echo224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-256 context. This process performs no memory allocation.
*
* @param cc the ECHO-256 context (pointer to a
* <code>sph_echo256_context</code>)
*/
void sph_echo256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo256(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-256 context
* @param dst the destination buffer
*/
void sph_echo256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-384 context. This process performs no memory allocation.
*
* @param cc the ECHO-384 context (pointer to a
* <code>sph_echo384_context</code>)
*/
void sph_echo384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo384(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-384 context
* @param dst the destination buffer
*/
void sph_echo384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-512 context. This process performs no memory allocation.
*
* @param cc the ECHO-512 context (pointer to a
* <code>sph_echo512_context</code>)
*/
void sph_echo512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo512(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-512 context
* @param dst the destination buffer
*/
void sph_echo512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif
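The comments above state that a running ECHO computation can be cloned by copying the context with memcpy(). A small sketch of that pattern, assuming two messages share a common prefix; the helper name and buffer sizes are illustrative.

#include <string.h>
#include "sph_echo.h"

/* Hash two messages that share a prefix without reprocessing it:
 * absorb the prefix once, then memcpy() the context and finish each
 * branch independently. */
static void hash_with_shared_prefix(const void *prefix, size_t plen,
                                    const void *tail_a, size_t alen,
                                    const void *tail_b, size_t blen,
                                    unsigned char out_a[32],
                                    unsigned char out_b[32])
{
    sph_echo256_context base, branch;

    sph_echo256_init(&base);
    sph_echo256(&base, prefix, plen);

    memcpy(&branch, &base, sizeof base);   /* clone the running state */

    sph_echo256(&base, tail_a, alen);
    sph_echo256_close(&base, out_a);

    sph_echo256(&branch, tail_b, blen);
    sph_echo256_close(&branch, out_b);
}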

File diff suppressed because it is too large

View file

@ -0,0 +1,89 @@
#ifndef SPH_FUGUE_H__
#define SPH_FUGUE_H__
#include <stddef.h>
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_fugue224 224
#define SPH_SIZE_fugue256 256
#define SPH_SIZE_fugue384 384
#define SPH_SIZE_fugue512 512
typedef struct {
#ifndef DOXYGEN_IGNORE
sph_u32 partial;
unsigned partial_len;
unsigned round_shift;
sph_u32 S[36];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
} sph_fugue_context;
typedef sph_fugue_context sph_fugue224_context;
typedef sph_fugue_context sph_fugue256_context;
typedef sph_fugue_context sph_fugue384_context;
typedef sph_fugue_context sph_fugue512_context;
void sph_fugue224_init(void *cc);
void sph_fugue224(void *cc, const void *data, size_t len);
void sph_fugue224_close(void *cc, void *dst);
void sph_fugue224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void sph_fugue256_init(void *cc);
void sph_fugue256(void *cc, const void *data, size_t len);
void sph_fugue256_close(void *cc, void *dst);
void sph_fugue256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void sph_fugue384_init(void *cc);
void sph_fugue384(void *cc, const void *data, size_t len);
void sph_fugue384_close(void *cc, void *dst);
void sph_fugue384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void sph_fugue512_init(void *cc);
void sph_fugue512(void *cc, const void *data, size_t len);
void sph_fugue512_close(void *cc, void *dst);
void sph_fugue512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_fugue512_full( cc, dst, data, len ) \
do{ \
sph_fugue512_init( cc ); \
sph_fugue512( cc, data, len ); \
sph_fugue512_close( cc, dst ); \
}while(0)
#ifdef __cplusplus
}
#endif
#endif
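sph_fugue512_full above wraps the usual init/update/close triple in one macro. A minimal one-shot sketch; the wrapper name is illustrative.

#include "sph_fugue.h"

/* One-shot Fugue-512: the macro expands to init + update + close on
 * the local context, writing a 64-byte digest to out. */
static void fugue512_oneshot(const void *data, size_t len,
                             unsigned char out[64])
{
    sph_fugue512_context cc;

    sph_fugue512_full(&cc, out, data, len);
}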

File diff suppressed because it is too large

View file

@ -0,0 +1,329 @@
/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Groestl interface. This code implements Groestl with the recommended
* parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_groestl.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_GROESTL_H__
#define SPH_GROESTL_H__
#ifdef __cplusplus
extern "C" {
#endif
#include "sph_types.h"
#include <stddef.h>
/**
* Output size (in bits) for Groestl-224.
*/
#define SPH_SIZE_groestl224 224
/**
* Output size (in bits) for Groestl-256.
*/
#define SPH_SIZE_groestl256 256
/**
* Output size (in bits) for Groestl-384.
*/
#define SPH_SIZE_groestl384 384
/**
* Output size (in bits) for Groestl-512.
*/
#define SPH_SIZE_groestl512 512
/**
* This structure is a context for Groestl-224 and Groestl-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a Groestl computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running Groestl
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
union {
#if SPH_64
sph_u64 wide[8];
#endif
sph_u32 narrow[16];
} state;
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_groestl_small_context;
/**
* This structure is a context for Groestl-224 computations. It is
* identical to the common <code>sph_groestl_small_context</code>.
*/
typedef sph_groestl_small_context sph_groestl224_context;
/**
* This structure is a context for Groestl-256 computations. It is
* identical to the common <code>sph_groestl_small_context</code>.
*/
typedef sph_groestl_small_context sph_groestl256_context;
/**
* This structure is a context for Groestl-384 and Groestl-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a Groestl computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running Groestl
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
union {
#if SPH_64
sph_u64 wide[16];
#endif
sph_u32 narrow[32];
} state;
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_groestl_big_context;
/**
* This structure is a context for Groestl-384 computations. It is
 * identical to the common <code>sph_groestl_big_context</code>.
*/
typedef sph_groestl_big_context sph_groestl384_context;
/**
* This structure is a context for Groestl-512 computations. It is
 * identical to the common <code>sph_groestl_big_context</code>.
*/
typedef sph_groestl_big_context sph_groestl512_context;
/**
* Initialize a Groestl-224 context. This process performs no memory allocation.
*
* @param cc the Groestl-224 context (pointer to a
* <code>sph_groestl224_context</code>)
*/
void sph_groestl224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Groestl-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_groestl224(void *cc, const void *data, size_t len);
/**
* Terminate the current Groestl-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Groestl-224 context
* @param dst the destination buffer
*/
void sph_groestl224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Groestl-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Groestl-256 context. This process performs no memory allocation.
*
* @param cc the Groestl-256 context (pointer to a
* <code>sph_groestl256_context</code>)
*/
void sph_groestl256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Groestl-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_groestl256(void *cc, const void *data, size_t len);
/**
* Terminate the current Groestl-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Groestl-256 context
* @param dst the destination buffer
*/
void sph_groestl256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Groestl-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Groestl-384 context. This process performs no memory allocation.
*
* @param cc the Groestl-384 context (pointer to a
* <code>sph_groestl384_context</code>)
*/
void sph_groestl384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Groestl-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_groestl384(void *cc, const void *data, size_t len);
/**
* Terminate the current Groestl-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Groestl-384 context
* @param dst the destination buffer
*/
void sph_groestl384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Groestl-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Groestl-512 context. This process performs no memory allocation.
*
* @param cc the Groestl-512 context (pointer to a
* <code>sph_groestl512_context</code>)
*/
void sph_groestl512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Groestl-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_groestl512(void *cc, const void *data, size_t len);
/**
* Terminate the current Groestl-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Groestl-512 context
* @param dst the destination buffer
*/
void sph_groestl512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Groestl-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
#ifdef __cplusplus
}
#endif
#endif
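The addbits_and_close variants above take 0 to 7 trailing bits packed into the high end of ub, bit 7 first, per the big-endian convention in the comments. A sketch appending the three bits 1, 0, 1 to a Groestl-256 input; the helper name and bit values are illustrative.

#include "sph_groestl.h"

/* Hash a byte message followed by 3 extra bits 1, 0, 1.
 * Per the convention above, the n extra bits occupy the top bits of ub,
 * numbered 7 down to 8-n, so the sequence 1,0,1 is passed as 0xA0 with
 * n = 3 (bit 7 = 1, bit 6 = 0, bit 5 = 1). */
static void groestl256_with_extra_bits(const void *data, size_t len,
                                       unsigned char out[32])
{
    sph_groestl256_context cc;

    sph_groestl256_init(&cc);
    sph_groestl256(&cc, data, len);
    sph_groestl256_addbits_and_close(&cc, 0xA0, 3, out);
}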

View file

@ -0,0 +1,867 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_hamsi.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_SMALL_FOOTPRINT_HAMSI 1
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
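/*
 * Worked example of the sizing formula above: with n=8 on Hamsi-224/256
 * (w=32, r=8), t = ceil(32/8) = 4 tables of 2^8*8*4 = 8192 bytes each,
 * i.e. 32768 bytes total, matching size1 for n=8; on Hamsi-384/512
 * (w=64, r=16), t = 8 tables of 2^8*16*4 = 16384 bytes give the
 * 131072 bytes listed as size2.
 */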
#if !defined SPH_HAMSI_EXPAND_SMALL
#if SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_HAMSI_EXPAND_SMALL 4
#else
#define SPH_HAMSI_EXPAND_SMALL 8
#endif
#endif
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 8
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#include "sph_hamsi_helper.c"
static const sph_u32 IV224[] = {
SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
/*
* This version is the one used in the Hamsi submission package for
* round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
* shall soon be corrected in the official Hamsi specification.
*
static const sph_u32 IV224[] = {
SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
*/
static const sph_u32 IV256[] = {
SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
SPH_C32(0x656d656e), SPH_C32(0x7420456c)
};
static const sph_u32 IV384[] = {
SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
SPH_C32(0x2c204b61)
};
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
#define DECL_STATE_SMALL \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
#define READ_STATE_SMALL(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_SMALL(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define s0 m0
#define s1 m1
#define s2 c0
#define s3 c1
#define s4 c2
#define s5 c3
#define s6 m2
#define s7 m3
#define s8 m4
#define s9 m5
#define sA c4
#define sB c5
#define sC c6
#define sD c7
#define sE m6
#define sF m7
#define SBOX(a, b, c, d) do { \
sph_u32 t; \
t = (a); \
(a) &= (c); \
(a) ^= (d); \
(c) ^= (b); \
(c) ^= (a); \
(d) |= t; \
(d) ^= (b); \
t ^= (c); \
(b) = (d); \
(d) |= t; \
(d) ^= (a); \
(a) &= (b); \
t ^= (a); \
(b) ^= (d); \
(b) ^= t; \
(a) = (c); \
(c) = (b); \
(b) = (d); \
(d) = SPH_T32(~t); \
} while (0)
#define L(a, b, c, d) do { \
(a) = SPH_ROTL32(a, 13); \
(c) = SPH_ROTL32(c, 3); \
(b) ^= (a) ^ (c); \
(d) ^= (c) ^ SPH_T32((a) << 3); \
(b) = SPH_ROTL32(b, 1); \
(d) = SPH_ROTL32(d, 7); \
(a) ^= (b) ^ (d); \
(c) ^= (d) ^ SPH_T32((b) << 7); \
(a) = SPH_ROTL32(a, 5); \
(c) = SPH_ROTL32(c, 22); \
} while (0)
#define ROUND_SMALL(rc, alpha) do { \
s0 ^= alpha[0x00]; \
s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
s2 ^= alpha[0x02]; \
s3 ^= alpha[0x03]; \
s4 ^= alpha[0x08]; \
s5 ^= alpha[0x09]; \
s6 ^= alpha[0x0A]; \
s7 ^= alpha[0x0B]; \
s8 ^= alpha[0x10]; \
s9 ^= alpha[0x11]; \
sA ^= alpha[0x12]; \
sB ^= alpha[0x13]; \
sC ^= alpha[0x18]; \
sD ^= alpha[0x19]; \
sE ^= alpha[0x1A]; \
sF ^= alpha[0x1B]; \
SBOX(s0, s4, s8, sC); \
SBOX(s1, s5, s9, sD); \
SBOX(s2, s6, sA, sE); \
SBOX(s3, s7, sB, sF); \
L(s0, s5, sA, sF); \
L(s1, s6, sB, sC); \
L(s2, s7, s8, sD); \
L(s3, s4, s9, sE); \
} while (0)
#define P_SMALL do { \
ROUND_SMALL(0, alpha_n); \
ROUND_SMALL(1, alpha_n); \
ROUND_SMALL(2, alpha_n); \
} while (0)
#define PF_SMALL do { \
ROUND_SMALL(0, alpha_f); \
ROUND_SMALL(1, alpha_f); \
ROUND_SMALL(2, alpha_f); \
ROUND_SMALL(3, alpha_f); \
ROUND_SMALL(4, alpha_f); \
ROUND_SMALL(5, alpha_f); \
} while (0)
#define T_SMALL do { \
/* order is important */ \
c7 = (sc->h[7] ^= sB); \
c6 = (sc->h[6] ^= sA); \
c5 = (sc->h[5] ^= s9); \
c4 = (sc->h[4] ^= s8); \
c3 = (sc->h[3] ^= s3); \
c2 = (sc->h[2] ^= s2); \
c1 = (sc->h[1] ^= s1); \
c0 = (sc->h[0] ^= s0); \
} while (0)
static void
hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_SMALL
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 5;
#else
tmp = SPH_T32((sph_u32)num << 5);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 14);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_SMALL(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_SMALL;
P_SMALL;
T_SMALL;
buf += 4;
}
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_SMALL
READ_STATE_SMALL(sc);
INPUT_SMALL;
PF_SMALL;
T_SMALL;
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
{
if (sc->partial_len != 0) {
size_t mlen;
mlen = 4 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_small(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_small(sc, data, (len >> 2));
data = (const unsigned char *)data + (len & ~(size_t)3);
len &= (size_t)3;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_small_close(sph_hamsi_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[12];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
memcpy(pad, sc->partial, ptr);
#if SPH_64
sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad + 4, sc->count_high);
sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
pad[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 4)
pad[ptr ++] = 0;
hamsi_small(sc, pad, 2);
hamsi_small_final(sc, pad + 8);
out = dst;
for (u = 0; u < out_size_w32; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
#define DECL_STATE_BIG \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
#define READ_STATE_BIG(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c8 = sc->h[0x8]; \
c9 = sc->h[0x9]; \
cA = sc->h[0xA]; \
cB = sc->h[0xB]; \
cC = sc->h[0xC]; \
cD = sc->h[0xD]; \
cE = sc->h[0xE]; \
cF = sc->h[0xF]; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0x8] = c8; \
sc->h[0x9] = c9; \
sc->h[0xA] = cA; \
sc->h[0xB] = cB; \
sc->h[0xC] = cC; \
sc->h[0xD] = cD; \
sc->h[0xE] = cE; \
sc->h[0xF] = cF; \
} while (0)
#define s00 m0
#define s01 m1
#define s02 c0
#define s03 c1
#define s04 m2
#define s05 m3
#define s06 c2
#define s07 c3
#define s08 c4
#define s09 c5
#define s0A m4
#define s0B m5
#define s0C c6
#define s0D c7
#define s0E m6
#define s0F m7
#define s10 m8
#define s11 m9
#define s12 c8
#define s13 c9
#define s14 mA
#define s15 mB
#define s16 cA
#define s17 cB
#define s18 cC
#define s19 cD
#define s1A mC
#define s1B mD
#define s1C cE
#define s1D cF
#define s1E mE
#define s1F mF
#define ROUND_BIG(rc, alpha) do { \
s00 ^= alpha[0x00]; \
s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
s02 ^= alpha[0x02]; \
s03 ^= alpha[0x03]; \
s04 ^= alpha[0x04]; \
s05 ^= alpha[0x05]; \
s06 ^= alpha[0x06]; \
s07 ^= alpha[0x07]; \
s08 ^= alpha[0x08]; \
s09 ^= alpha[0x09]; \
s0A ^= alpha[0x0A]; \
s0B ^= alpha[0x0B]; \
s0C ^= alpha[0x0C]; \
s0D ^= alpha[0x0D]; \
s0E ^= alpha[0x0E]; \
s0F ^= alpha[0x0F]; \
s10 ^= alpha[0x10]; \
s11 ^= alpha[0x11]; \
s12 ^= alpha[0x12]; \
s13 ^= alpha[0x13]; \
s14 ^= alpha[0x14]; \
s15 ^= alpha[0x15]; \
s16 ^= alpha[0x16]; \
s17 ^= alpha[0x17]; \
s18 ^= alpha[0x18]; \
s19 ^= alpha[0x19]; \
s1A ^= alpha[0x1A]; \
s1B ^= alpha[0x1B]; \
s1C ^= alpha[0x1C]; \
s1D ^= alpha[0x1D]; \
s1E ^= alpha[0x1E]; \
s1F ^= alpha[0x1F]; \
SBOX(s00, s08, s10, s18); \
SBOX(s01, s09, s11, s19); \
SBOX(s02, s0A, s12, s1A); \
SBOX(s03, s0B, s13, s1B); \
SBOX(s04, s0C, s14, s1C); \
SBOX(s05, s0D, s15, s1D); \
SBOX(s06, s0E, s16, s1E); \
SBOX(s07, s0F, s17, s1F); \
L(s00, s09, s12, s1B); \
L(s01, s0A, s13, s1C); \
L(s02, s0B, s14, s1D); \
L(s03, s0C, s15, s1E); \
L(s04, s0D, s16, s1F); \
L(s05, s0E, s17, s18); \
L(s06, s0F, s10, s19); \
L(s07, s08, s11, s1A); \
L(s00, s02, s05, s07); \
L(s10, s13, s15, s16); \
L(s09, s0B, s0C, s0E); \
L(s19, s1A, s1C, s1F); \
} while (0)
#if SPH_SMALL_FOOTPRINT_HAMSI
#define P_BIG do { \
unsigned r; \
for (r = 0; r < 6; r ++) \
ROUND_BIG(r, alpha_n); \
} while (0)
#define PF_BIG do { \
unsigned r; \
for (r = 0; r < 12; r ++) \
ROUND_BIG(r, alpha_f); \
} while (0)
#else
#define P_BIG do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)
#define PF_BIG do { \
ROUND_BIG(0, alpha_f); \
ROUND_BIG(1, alpha_f); \
ROUND_BIG(2, alpha_f); \
ROUND_BIG(3, alpha_f); \
ROUND_BIG(4, alpha_f); \
ROUND_BIG(5, alpha_f); \
ROUND_BIG(6, alpha_f); \
ROUND_BIG(7, alpha_f); \
ROUND_BIG(8, alpha_f); \
ROUND_BIG(9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)
#endif
#define T_BIG do { \
/* order is important */ \
cF = (sc->h[0xF] ^= s17); \
cE = (sc->h[0xE] ^= s16); \
cD = (sc->h[0xD] ^= s15); \
cC = (sc->h[0xC] ^= s14); \
cB = (sc->h[0xB] ^= s13); \
cA = (sc->h[0xA] ^= s12); \
c9 = (sc->h[0x9] ^= s11); \
c8 = (sc->h[0x8] ^= s10); \
c7 = (sc->h[0x7] ^= s07); \
c6 = (sc->h[0x6] ^= s06); \
c5 = (sc->h[0x5] ^= s05); \
c4 = (sc->h[0x4] ^= s04); \
c3 = (sc->h[0x3] ^= s03); \
c2 = (sc->h[0x2] ^= s02); \
c1 = (sc->h[0x1] ^= s01); \
c0 = (sc->h[0x0] ^= s00); \
} while (0)
static void
hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_BIG
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 6;
#else
tmp = SPH_T32((sph_u32)num << 6);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 13);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_BIG(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
INPUT_BIG;
P_BIG;
T_BIG;
buf += 8;
}
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
DECL_STATE_BIG
READ_STATE_BIG(sc);
INPUT_BIG;
PF_BIG;
T_BIG;
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
{
if (sc->partial_len != 0) {
size_t mlen;
mlen = 8 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_big(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_big(sc, data, (len >> 3));
data = (const unsigned char *)data + (len & ~(size_t)7);
len &= (size_t)7;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_big_close(sph_hamsi_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[8];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
#if SPH_64
sph_enc64be(pad, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad, sc->count_high);
sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 8)
sc->partial[ptr ++] = 0;
hamsi_big(sc, sc->partial, 1);
hamsi_big_final(sc, pad);
out = dst;
if (out_size_w32 == 12) {
sph_enc32be(out + 0, sc->h[ 0]);
sph_enc32be(out + 4, sc->h[ 1]);
sph_enc32be(out + 8, sc->h[ 3]);
sph_enc32be(out + 12, sc->h[ 4]);
sph_enc32be(out + 16, sc->h[ 5]);
sph_enc32be(out + 20, sc->h[ 6]);
sph_enc32be(out + 24, sc->h[ 8]);
sph_enc32be(out + 28, sc->h[ 9]);
sph_enc32be(out + 32, sc->h[10]);
sph_enc32be(out + 36, sc->h[12]);
sph_enc32be(out + 40, sc->h[13]);
sph_enc32be(out + 44, sc->h[15]);
} else {
for (u = 0; u < 16; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
}
/* see sph_hamsi.h */
void
sph_hamsi224_init(void *cc)
{
hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi224_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi256_init(void *cc)
{
hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi256_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi384_init(void *cc)
{
hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi384_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi512_init(void *cc)
{
hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi512_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 16);
// hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 16);
// hamsi_big_init(cc, IV512);
}
#ifdef __cplusplus
}
#endif
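Note that in this copy of hamsi.c the reinitialization calls at the end of the *_close() functions are commented out, so a context is not reset on close, despite what the generic interface comments say. A minimal sketch of safe reuse; the helper name is illustrative.

#include "sph_hamsi.h"

/* Hash two messages with one context. Because the *_close() functions
 * above no longer reinitialize the state (the init calls are commented
 * out), an explicit *_init() is required before each new message. */
static void hamsi256_twice(const void *a, size_t alen,
                           const void *b, size_t blen,
                           unsigned char out_a[32],
                           unsigned char out_b[32])
{
    sph_hamsi256_context cc;

    sph_hamsi256_init(&cc);
    sph_hamsi256(&cc, a, alen);
    sph_hamsi256_close(&cc, out_a);

    sph_hamsi256_init(&cc);            /* required: close() did not reset */
    sph_hamsi256(&cc, b, blen);
    sph_hamsi256_close(&cc, out_b);
}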

View file

@ -0,0 +1,321 @@
/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Hamsi interface. This code implements Hamsi with the recommended
* parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_hamsi.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_HAMSI_H__
#define SPH_HAMSI_H__
#include <stddef.h>
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
/**
* Output size (in bits) for Hamsi-224.
*/
#define SPH_SIZE_hamsi224 224
/**
* Output size (in bits) for Hamsi-256.
*/
#define SPH_SIZE_hamsi256 256
/**
* Output size (in bits) for Hamsi-384.
*/
#define SPH_SIZE_hamsi384 384
/**
* Output size (in bits) for Hamsi-512.
*/
#define SPH_SIZE_hamsi512 512
/**
* This structure is a context for Hamsi-224 and Hamsi-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a Hamsi computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running Hamsi
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char partial[4];
size_t partial_len;
sph_u32 h[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_hamsi_small_context;
/**
* This structure is a context for Hamsi-224 computations. It is
* identical to the common <code>sph_hamsi_small_context</code>.
*/
typedef sph_hamsi_small_context sph_hamsi224_context;
/**
* This structure is a context for Hamsi-256 computations. It is
* identical to the common <code>sph_hamsi_small_context</code>.
*/
typedef sph_hamsi_small_context sph_hamsi256_context;
/**
* This structure is a context for Hamsi-384 and Hamsi-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a Hamsi computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running Hamsi
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char partial[8];
size_t partial_len;
sph_u32 h[16];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_hamsi_big_context;
/**
* This structure is a context for Hamsi-384 computations. It is
 * identical to the common <code>sph_hamsi_big_context</code>.
*/
typedef sph_hamsi_big_context sph_hamsi384_context;
/**
* This structure is a context for Hamsi-512 computations. It is
 * identical to the common <code>sph_hamsi_big_context</code>.
*/
typedef sph_hamsi_big_context sph_hamsi512_context;
/**
* Initialize a Hamsi-224 context. This process performs no memory allocation.
*
* @param cc the Hamsi-224 context (pointer to a
* <code>sph_hamsi224_context</code>)
*/
void sph_hamsi224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Hamsi-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_hamsi224(void *cc, const void *data, size_t len);
/**
* Terminate the current Hamsi-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Hamsi-224 context
* @param dst the destination buffer
*/
void sph_hamsi224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Hamsi-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_hamsi224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Hamsi-256 context. This process performs no memory allocation.
*
* @param cc the Hamsi-256 context (pointer to a
* <code>sph_hamsi256_context</code>)
*/
void sph_hamsi256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Hamsi-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_hamsi256(void *cc, const void *data, size_t len);
/**
* Terminate the current Hamsi-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Hamsi-256 context
* @param dst the destination buffer
*/
void sph_hamsi256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Hamsi-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_hamsi256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Hamsi-384 context. This process performs no memory allocation.
*
* @param cc the Hamsi-384 context (pointer to a
* <code>sph_hamsi384_context</code>)
*/
void sph_hamsi384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Hamsi-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_hamsi384(void *cc, const void *data, size_t len);
/**
* Terminate the current Hamsi-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Hamsi-384 context
* @param dst the destination buffer
*/
void sph_hamsi384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Hamsi-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_hamsi384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Hamsi-512 context. This process performs no memory allocation.
*
* @param cc the Hamsi-512 context (pointer to a
* <code>sph_hamsi512_context</code>)
*/
void sph_hamsi512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Hamsi-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_hamsi512(void *cc, const void *data, size_t len);
/**
* Terminate the current Hamsi-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Hamsi-512 context
* @param dst the destination buffer
*/
void sph_hamsi512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Hamsi-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_hamsi512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,298 @@
/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
/**
* JH interface. JH is a family of functions which differ by
* their output size; this implementation defines JH for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_jh.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_JH_H__
#define SPH_JH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for JH-224.
*/
#define SPH_SIZE_jh224 224
/**
* Output size (in bits) for JH-256.
*/
#define SPH_SIZE_jh256 256
/**
* Output size (in bits) for JH-384.
*/
#define SPH_SIZE_jh384 384
/**
* Output size (in bits) for JH-512.
*/
#define SPH_SIZE_jh512 512
/**
* This structure is a context for JH computations: it contains the
* intermediate values and some data from the last entered block. Once
* a JH computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running JH computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
union {
#if SPH_64
sph_u64 wide[16];
#endif
sph_u32 narrow[32];
} H;
#if SPH_64
sph_u64 block_count;
#else
sph_u32 block_count_high, block_count_low;
#endif
#endif
} sph_jh_context;
/**
* Type for a JH-224 context (identical to the common context).
*/
typedef sph_jh_context sph_jh224_context;
/**
* Type for a JH-256 context (identical to the common context).
*/
typedef sph_jh_context sph_jh256_context;
/**
* Type for a JH-384 context (identical to the common context).
*/
typedef sph_jh_context sph_jh384_context;
/**
* Type for a JH-512 context (identical to the common context).
*/
typedef sph_jh_context sph_jh512_context;
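/*
 * Illustrative sketch (hypothetical helper, compiled out): cloning a
 * running JH-256 computation by copying the context, as documented for
 * <code>sph_jh_context</code> above.  Both digests share the work done
 * on the common prefix.
 */
#if 0
#include <string.h>

static void jh256_fork_example(const void *prefix, size_t plen,
                               const void *a, size_t alen,
                               const void *b, size_t blen,
                               unsigned char digest_a[32],
                               unsigned char digest_b[32])
{
    sph_jh256_context base, fork;

    sph_jh256_init(&base);
    sph_jh256(&base, prefix, plen);      /* hash the shared prefix once */

    memcpy(&fork, &base, sizeof base);   /* plain copy clones the state */

    sph_jh256(&base, a, alen);
    sph_jh256_close(&base, digest_a);    /* JH-256(prefix || a) */

    sph_jh256(&fork, b, blen);
    sph_jh256_close(&fork, digest_b);    /* JH-256(prefix || b) */
}
#endif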
/**
* Initialize a JH-224 context. This process performs no memory allocation.
*
* @param cc the JH-224 context (pointer to a
* <code>sph_jh224_context</code>)
*/
void sph_jh224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the JH-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_jh224(void *cc, const void *data, size_t len);
/**
* Terminate the current JH-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the JH-224 context
* @param dst the destination buffer
*/
void sph_jh224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the JH-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_jh224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a JH-256 context. This process performs no memory allocation.
*
* @param cc the JH-256 context (pointer to a
* <code>sph_jh256_context</code>)
*/
void sph_jh256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the JH-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_jh256(void *cc, const void *data, size_t len);
/**
* Terminate the current JH-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the JH-256 context
* @param dst the destination buffer
*/
void sph_jh256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the JH-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_jh256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a JH-384 context. This process performs no memory allocation.
*
* @param cc the JH-384 context (pointer to a
* <code>sph_jh384_context</code>)
*/
void sph_jh384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the JH-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_jh384(void *cc, const void *data, size_t len);
/**
* Terminate the current JH-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the JH-384 context
* @param dst the destination buffer
*/
void sph_jh384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the JH-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_jh384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a JH-512 context. This process performs no memory allocation.
*
* @param cc the JH-512 context (pointer to a
* <code>sph_jh512_context</code>)
*/
void sph_jh512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the JH-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_jh512(void *cc, const void *data, size_t len);
/**
* Terminate the current JH-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the JH-512 context
* @param dst the destination buffer
*/
void sph_jh512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the JH-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_jh512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large


@@ -0,0 +1,296 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_KECCAK_H__
#define SPH_KECCAK_H__
#ifdef __cplusplus
extern "C" {
#endif
// Taken from keccak-gate.h
extern int hard_coded_eb;
#include "sph_types.h"
#include <stddef.h>
/**
* Output size (in bits) for Keccak-224.
*/
#define SPH_SIZE_keccak224 224
/**
* Output size (in bits) for Keccak-256.
*/
#define SPH_SIZE_keccak256 256
/**
* Output size (in bits) for Keccak-384.
*/
#define SPH_SIZE_keccak384 384
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[144]; /* first field, for alignment */
size_t ptr, lim;
union {
#if SPH_64
sph_u64 wide[25];
#endif
sph_u32 narrow[50];
} u;
#endif
} sph_keccak_context;
/**
* Type for a Keccak-224 context (identical to the common context).
*/
typedef sph_keccak_context sph_keccak224_context;
/**
* Type for a Keccak-256 context (identical to the common context).
*/
typedef sph_keccak_context sph_keccak256_context;
/**
* Type for a Keccak-384 context (identical to the common context).
*/
typedef sph_keccak_context sph_keccak384_context;
/**
* Type for a Keccak-512 context (identical to the common context).
*/
typedef sph_keccak_context sph_keccak512_context;
/**
* Initialize a Keccak-224 context. This process performs no memory allocation.
*
* @param cc the Keccak-224 context (pointer to a
* <code>sph_keccak224_context</code>)
*/
void sph_keccak224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Keccak-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_keccak224(void *cc, const void *data, size_t len);
/**
* Terminate the current Keccak-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Keccak-224 context
* @param dst the destination buffer
*/
void sph_keccak224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Keccak-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
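/*
 * Illustrative sketch (hypothetical helper, compiled out) of the ub/n
 * convention documented above: to append the three extra bits 1,0,1
 * after the last full byte, place them at bit positions 7..5 of
 * <code>ub</code> (big-endian at the byte level), i.e. ub = 0xA0, n = 3.
 */
#if 0
static void keccak224_extra_bits_example(const void *msg, size_t len,
                                         unsigned char digest[28])
{
    sph_keccak224_context cc;

    sph_keccak224_init(&cc);
    sph_keccak224(&cc, msg, len);
    sph_keccak224_addbits_and_close(&cc, 0xA0, 3, digest);
}
#endif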
/**
* Initialize a Keccak-256 context. This process performs no memory allocation.
*
* @param cc the Keccak-256 context (pointer to a
* <code>sph_keccak256_context</code>)
*/
void sph_keccak256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Keccak-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_keccak256(void *cc, const void *data, size_t len);
/**
* Terminate the current Keccak-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Keccak-256 context
* @param dst the destination buffer
*/
void sph_keccak256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Keccak-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Keccak-384 context. This process performs no memory allocation.
*
* @param cc the Keccak-384 context (pointer to a
* <code>sph_keccak384_context</code>)
*/
void sph_keccak384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Keccak-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_keccak384(void *cc, const void *data, size_t len);
/**
* Terminate the current Keccak-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Keccak-384 context
* @param dst the destination buffer
*/
void sph_keccak384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Keccak-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Keccak-512 context. This process performs no memory allocation.
*
* @param cc the Keccak-512 context (pointer to a
* <code>sph_keccak512_context</code>)
*/
void sph_keccak512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Keccak-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_keccak512(void *cc, const void *data, size_t len);
/**
* Terminate the current Keccak-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Keccak-512 context
* @param dst the destination buffer
*/
void sph_keccak512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Keccak-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large


@@ -0,0 +1,296 @@
/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */
/**
* Luffa interface. Luffa is a family of functions which differ by
* their output size; this implementation defines Luffa for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_luffa.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_LUFFA_H__
#define SPH_LUFFA_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for Luffa-224.
*/
#define SPH_SIZE_luffa224 224
/**
* Output size (in bits) for Luffa-256.
*/
#define SPH_SIZE_luffa256 256
/**
* Output size (in bits) for Luffa-384.
*/
#define SPH_SIZE_luffa384 384
/**
* Output size (in bits) for Luffa-512.
*/
#define SPH_SIZE_luffa512 512
/**
* This structure is a context for Luffa-224 computations: it contains
* the intermediate values and some data from the last entered block.
* Once a Luffa computation has been performed, the context can be
* reused for another computation.
*
* The contents of this structure are private. A running Luffa
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[32]; /* first field, for alignment */
size_t ptr;
sph_u32 V[3][8];
#endif
} sph_luffa224_context;
/**
* This structure is a context for Luffa-256 computations. It is
* identical to <code>sph_luffa224_context</code>.
*/
typedef sph_luffa224_context sph_luffa256_context;
/**
* This structure is a context for Luffa-384 computations.
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[32]; /* first field, for alignment */
size_t ptr;
sph_u32 V[4][8];
#endif
} sph_luffa384_context;
/**
* This structure is a context for Luffa-512 computations.
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[32]; /* first field, for alignment */
size_t ptr;
sph_u32 V[5][8];
#endif
} sph_luffa512_context;
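/*
 * Illustrative sketch (hypothetical helper, compiled out): incremental
 * Luffa-512 hashing.  Chunk sizes are arbitrary; the 32-byte buf field
 * of the context absorbs partial blocks between calls.
 */
#if 0
static void luffa512_example(const unsigned char *data, size_t len,
                             unsigned char digest[64])
{
    sph_luffa512_context cc;
    size_t off = 0;

    sph_luffa512_init(&cc);
    while (off < len) {                      /* feed uneven chunks */
        size_t n = (len - off < 100) ? (len - off) : 100;
        sph_luffa512(&cc, data + off, n);
        off += n;
    }
    sph_luffa512_close(&cc, digest);         /* 64-byte digest */
}
#endif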
/**
* Initialize a Luffa-224 context. This process performs no memory allocation.
*
* @param cc the Luffa-224 context (pointer to a
* <code>sph_luffa224_context</code>)
*/
void sph_luffa224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Luffa-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_luffa224(void *cc, const void *data, size_t len);
/**
* Terminate the current Luffa-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Luffa-224 context
* @param dst the destination buffer
*/
void sph_luffa224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (28 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Luffa-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_luffa224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Luffa-256 context. This process performs no memory allocation.
*
* @param cc the Luffa-256 context (pointer to a
* <code>sph_luffa256_context</code>)
*/
void sph_luffa256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Luffa-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_luffa256(void *cc, const void *data, size_t len);
/**
* Terminate the current Luffa-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Luffa-256 context
* @param dst the destination buffer
*/
void sph_luffa256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (32 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Luffa-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_luffa256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Luffa-384 context. This process performs no memory allocation.
*
* @param cc the Luffa-384 context (pointer to a
* <code>sph_luffa384_context</code>)
*/
void sph_luffa384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Luffa-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_luffa384(void *cc, const void *data, size_t len);
/**
* Terminate the current Luffa-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Luffa-384 context
* @param dst the destination buffer
*/
void sph_luffa384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (48 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Luffa-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_luffa384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Luffa-512 context. This process performs no memory allocation.
*
* @param cc the Luffa-512 context (pointer to a
* <code>sph_luffa512_context</code>)
*/
void sph_luffa512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Luffa-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_luffa512(void *cc, const void *data, size_t len);
/**
* Terminate the current Luffa-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Luffa-512 context
* @param dst the destination buffer
*/
void sph_luffa512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Luffa-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_luffa512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif


@@ -0,0 +1,793 @@
/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SHA-224 / SHA-256 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_sha2.h"
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2
#define SPH_SMALL_FOOTPRINT_SHA2 1
#endif
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
#define ROTR SPH_ROTR32
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3))
#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10))
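/*
 * Note on the cached-XOR form of MAJ above (illustrative check below is
 * compiled out): MAJ(x,y,z) == y ^ ((x ^ y) & (y ^ z)) is the textbook
 * majority function.  Because the working registers rotate by one slot
 * per round, next round's (Y ^ Z) equals this round's (X ^ Y), so each
 * step saves X_xor_Y into Y_xor_Z and spares one XOR per round.
 */
#if 0
#include <assert.h>
static void maj_check(sph_u32 x, sph_u32 y, sph_u32 z)
{
    sph_u32 X_xor_Y, Y_xor_Z = y ^ z;
    assert(MAJ(x, y, z) == ((x & y) | (x & z) | (y & z)));
    (void)X_xor_Y; /* written by MAJ; consumed by the next round in real use */
}
#endif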
static const sph_u32 H224[8] = {
SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17),
SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
};
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
/*
 * The SHA2_ROUND_BODY macro defines the body of a SHA-224 / SHA-256
* compression function implementation. The "in" parameter should
* evaluate, when applied to a numerical input parameter from 0 to 15,
* to an expression which yields the corresponding input block. The "r"
* parameter should evaluate to an array or pointer expression
* designating the array of 8 words which contains the input and output
* of the compression function.
*/
/*
static const sph_u32 K[64] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};
*/
#if SPH_SMALL_FOOTPRINT_SHA2
#define SHA2_MEXP1(in, pc) do { \
W[pc] = in(pc); \
} while (0)
#define SHA2_MEXP2(in, pc) do { \
W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \
+ W[((pc) - 7) & 0x0F] \
+ SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \
} while (0)
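/*
 * What SHA2_MEXP2 implements, written out as a plain function (compiled
 * out, illustrative only): the 64-entry SHA-256 message schedule is kept
 * as a rolling 16-word window indexed mod 16, so W[t-16] is overwritten
 * in place by the new W[t]:
 *     W[t] = SSG2_1(W[t-2]) + W[t-7] + SSG2_0(W[t-15]) + W[t-16]
 */
#if 0
static void sha2_mexp2_plain(sph_u32 W[16], unsigned t) /* 16 <= t < 64 */
{
    W[t & 0x0F] = SPH_T32(SSG2_1(W[(t - 2) & 0x0F])
                        + W[(t - 7) & 0x0F]
                        + SSG2_0(W[(t - 15) & 0x0F])
                        + W[(t - 16) & 0x0F]); /* same slot as W[t & 0x0F] */
}
#endif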
#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \
sph_u32 t1, t2; \
SHA2_MEXP ## n(in, pc); \
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
Y_xor_Z = X_xor_Y; \
d = SPH_T32(d + t1); \
h = SPH_T32(t1 + t2); \
} while (0)
#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \
SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc)
#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
sph_u32 W[16]; \
unsigned pcount; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
pcount = 0; \
Y_xor_Z = B ^ C; \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \
for (pcount = 16; pcount < 64; pcount += 16) { \
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \
} \
(r)[0] = SPH_T32((r)[0] + A); \
(r)[1] = SPH_T32((r)[1] + B); \
(r)[2] = SPH_T32((r)[2] + C); \
(r)[3] = SPH_T32((r)[3] + D); \
(r)[4] = SPH_T32((r)[4] + E); \
(r)[5] = SPH_T32((r)[5] + F); \
(r)[6] = SPH_T32((r)[6] + G); \
(r)[7] = SPH_T32((r)[7] + H); \
} while (0)
#else // large footprint (default)
#define SHA2_ROUND_BODY(in, r) do { \
	sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
Y_xor_Z = B ^ C; \
W00 = in(0); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x428A2F98) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = in(1); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x71374491) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = in(2); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB5C0FBCF) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = in(3); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xE9B5DBA5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = in(4); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x3956C25B) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = in(5); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x59F111F1) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = in(6); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x923F82A4) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = in(7); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xAB1C5ED5) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = in(8); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xD807AA98) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = in(9); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x12835B01) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = in(10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x243185BE) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = in(11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x550C7DC3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = in(12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x72BE5D74) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = in(13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x80DEB1FE) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = in(14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x9BDC06A7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = in(15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC19BF174) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xE49B69C1) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xEFBE4786) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x0FC19DC6) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x240CA1CC) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x2DE92C6F) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4A7484AA) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5CB0A9DC) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x76F988DA) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x983E5152) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA831C66D) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB00327C8) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xBF597FC7) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xC6E00BF3) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD5A79147) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x06CA6351) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x14292967) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x27B70A85) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x2E1B2138) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x4D2C6DFC) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x53380D13) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x650A7354) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x766A0ABB) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x81C2C92E) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x92722C85) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xA2BFE8A1) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA81A664B) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xC24B8B70) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xC76C51A3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xD192E819) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD6990624) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xF40E3585) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x106AA070) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x19A4C116) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x1E376C08) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x2748774C) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x34B0BCB5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x391C0CB3) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4ED8AA4A) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5B9CCA4F) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x682E6FF3) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x748F82EE) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x78A5636F) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x84C87814) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x8CC70208) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x90BEFFFA) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xA4506CEB) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xBEF9A3F7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC67178F2) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
(r)[0] = SPH_T32((r)[0] + A); \
(r)[1] = SPH_T32((r)[1] + B); \
(r)[2] = SPH_T32((r)[2] + C); \
(r)[3] = SPH_T32((r)[3] + D); \
(r)[4] = SPH_T32((r)[4] + E); \
(r)[5] = SPH_T32((r)[5] + F); \
(r)[6] = SPH_T32((r)[6] + G); \
(r)[7] = SPH_T32((r)[7] + H); \
} while (0)
#endif // small footprint else
/*
* One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access.
*/
static void
sha2_round(const unsigned char *data, sph_u32 r[8])
{
#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x)))
SHA2_ROUND_BODY(SHA2_IN, r);
#undef SHA2_IN
}
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) (data[x])
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) sph_dec32be_aligned( data+(x) )
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
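/*
 * Illustrative, compiled-out sketch of a midstate-style use of the
 * exported transforms: one raw compression of a single 64-byte block
 * starting from the standard SHA-256 IV.  No padding or length encoding
 * is applied, so this is the bare compression function, not a full hash.
 */
#if 0
static void sha256_midstate_example(const uint32_t block_be[16],
                                    uint32_t midstate[8])
{
    static const uint32_t iv[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    /* block_be holds the block as big-endian words in memory; the _be
     * variant byte-swaps while loading. */
    sph_sha256_transform_be(midstate, block_be, iv);
}
#endif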
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)
{
sph_sha224_context *sc;
sc = cc;
memcpy(sc->val, H224, sizeof H224);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
/* see sph_sha2.h */
void
sph_sha256_init(void *cc)
{
sph_sha256_context *sc;
sc = cc;
memcpy(sc->val, H256, sizeof H256);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
#define RFUN sha2_round
#define HASH sha224
#define BE32 1
#include "md_helper.c"
/* see sph_sha2.h */
void
sph_sha224_close(void *cc, void *dst)
{
sha224_close(cc, dst, 7);
// sph_sha224_init(cc);
}
/* see sph_sha2.h */
void
sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha224_addbits_and_close(cc, ub, n, dst, 7);
// sph_sha224_init(cc);
}
/* see sph_sha2.h */
void
sph_sha256_close(void *cc, void *dst)
{
sha224_close(cc, dst, 8);
// sph_sha256_init(cc);
}
/* see sph_sha2.h */
void
sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha224_addbits_and_close(cc, ub, n, dst, 8);
// sph_sha256_init(cc);
}
void sph_sha256_full( void *dst, const void *data, size_t len )
{
sph_sha256_context cc;
sph_sha256_init( &cc );
sph_sha256( &cc, data, len );
sph_sha256_close( &cc, dst );
}
void sha256d(void* hash, const void* data, int len)
{
sph_sha256_full(hash, data, len);
sph_sha256_full(hash, hash, 32);
}
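/*
 * Usage sketch (hypothetical, compiled out): sha256d is the double
 * SHA-256 used for Bitcoin-style block and transaction hashing.
 */
#if 0
static void sha256d_example(const unsigned char *header, int header_len,
                            unsigned char digest[32])
{
    sha256d(digest, header, header_len); /* SHA-256(SHA-256(header)) */
}
#endif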
/* see sph_sha2.h */
//void
//sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
//{
//#define SHA2_IN(x) msg[x]
// SHA2_ROUND_BODY(SHA2_IN, val);
//#undef SHA2_IN
//}


@@ -0,0 +1,383 @@
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
*
* SHA-256 has been published in FIPS 180-2, now amended with a change
* notice to include SHA-224 as well (which is a simple variation on
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
* standards can be found at:
* http://csrc.nist.gov/publications/fips/
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha2.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHA2_H__
#define SPH_SHA2_H__
#include <stddef.h>
#include <stdint.h>
#include "sph_types.h"
/**
* Output size (in bits) for SHA-224.
*/
#define SPH_SIZE_sha224 224
/**
* Output size (in bits) for SHA-256.
*/
#define SPH_SIZE_sha256 256
/**
* This structure is a context for SHA-224 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a SHA-224 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running SHA-224 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u32 val[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha224_context;
/**
* This structure is a context for SHA-256 computations. It is identical
* to the SHA-224 context. However, a context is initialized for SHA-224
* <strong>or</strong> SHA-256, but not both (the internal IV is not the
* same).
*/
typedef sph_sha224_context sph_sha256_context;
/**
* Initialize a SHA-224 context. This process performs no memory allocation.
*
* @param cc the SHA-224 context (pointer to
* a <code>sph_sha224_context</code>)
*/
void sph_sha224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHA-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha224(void *cc, const void *data, size_t len);
/**
* Terminate the current SHA-224 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-224 context
* @param dst the destination buffer
*/
void sph_sha224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
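/*
* Editor's note: a guarded-out sketch of the context-cloning pattern
* described in the context documentation above (copying a running
* computation with memcpy()). The function name sha224_clone_example
* is illustrative only.
*/
#if 0
#include <string.h>
static void sha224_clone_example(void)
{
sph_sha224_context base, clone;
unsigned char d1[28], d2[28];
sph_sha224_init(&base);
sph_sha224(&base, "common prefix", 13);
memcpy(&clone, &base, sizeof clone); /* fork the running computation */
sph_sha224(&base, "A", 1);
sph_sha224(&clone, "B", 1);
sph_sha224_close(&base, d1);
sph_sha224_close(&clone, d2); /* d1 and d2 differ */
}
#endif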
/**
* Apply the SHA-224 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the big-endian decoding). The
* <code>val</code> parameter contains the 8 32-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 256-bit input and output
*/
void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
/**
* Initialize a SHA-256 context. This process performs no memory allocation.
*
* @param cc the SHA-256 context (pointer to
* a <code>sph_sha256_context</code>)
*/
void sph_sha256_init(void *cc);
#ifdef DOXYGEN_IGNORE
/**
* Process some data bytes, for SHA-256. This function is identical to
* <code>sph_sha224()</code>.
*
* @param cc the SHA-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha256(void *cc, const void *data, size_t len);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_sha256 sph_sha224
#endif
/**
* Terminate the current SHA-256 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-256 context
* @param dst the destination buffer
*/
void sph_sha256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
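/*
* Editor's note: a guarded-out sketch of the ub/n convention documented
* above. To append the three extra bits 1,0,1 (in that order), pack
* them into the top bits of ub, first bit at bit 7: ub = 0xA0, n = 3.
* The function name is illustrative only.
*/
#if 0
static void sha256_addbits_example(void)
{
sph_sha256_context cc;
unsigned char digest[32];
sph_sha256_init(&cc);
sph_sha256(&cc, "whole bytes", 11);
sph_sha256_addbits_and_close(&cc, 0xA0, 3, digest); /* appends bits 1,0,1 */
}
#endif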
#ifdef DOXYGEN_IGNORE
/**
* Apply the SHA-256 compression function on the provided data. This
* function is identical to <code>sph_sha224_comp()</code>.
*
* @param msg the message block (16 values)
* @param val the function 256-bit input and output
*/
void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_sha256_comp sph_sha224_comp
#endif
void sph_sha256_full( void *dst, const void *data, size_t len );
void sha256d(void* hash, const void* data, int len);
// These shouldn't be called directly, use sha256-hash.h generic functions
// sha256_transform_le & sha256_transform_be instead.
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64
/**
* Output size (in bits) for SHA-384.
*/
#define SPH_SIZE_sha384 384
/**
* Output size (in bits) for SHA-512.
*/
#define SPH_SIZE_sha512 512
/**
* This structure is a context for SHA-384 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a SHA-384 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running SHA-384 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
sph_u64 val[8];
sph_u64 count;
#endif
} sph_sha384_context;
/**
* Initialize a SHA-384 context. This process performs no memory allocation.
*
* @param cc the SHA-384 context (pointer to
* a <code>sph_sha384_context</code>)
*/
void sph_sha384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHA-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha384(void *cc, const void *data, size_t len);
/**
* Terminate the current SHA-384 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-384 context
* @param dst the destination buffer
*/
void sph_sha384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
/**
* Apply the SHA-384 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 64-bit input blocks,
* as numerical values (hence after the big-endian decoding). The
* <code>val</code> parameter contains the 8 64-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 512-bit input and output
*/
void sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8]);
/**
* This structure is a context for SHA-512 computations. It is identical
* to the SHA-384 context. However, a context is initialized for SHA-384
* <strong>or</strong> SHA-512, but not both (the internal IV is not the
* same).
*/
typedef sph_sha384_context sph_sha512_context;
/**
* Initialize a SHA-512 context. This process performs no memory allocation.
*
* @param cc the SHA-512 context (pointer to
* a <code>sph_sha512_context</code>)
*/
void sph_sha512_init(void *cc);
#ifdef DOXYGEN_IGNORE
/**
* Process some data bytes, for SHA-512. This function is identical to
* <code>sph_sha384()</code>.
*
* @param cc the SHA-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha512(void *cc, const void *data, size_t len);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_sha512 sph_sha384
#endif
/**
* Terminate the current SHA-512 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-512 context
* @param dst the destination buffer
*/
void sph_sha512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
#ifdef DOXYGEN_IGNORE
/**
* Apply the SHA-512 compression function. This function is identical to
* <code>sph_sha384_comp()</code>.
*
* @param msg the message block (16 values)
* @param val the function 512-bit input and output
*/
void sph_sha512_comp(const sph_u64 msg[16], sph_u64 val[8]);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_sha512_comp sph_sha384_comp
#endif
#endif
#endif


@ -0,0 +1,808 @@
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
* Shabal implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_shabal.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
/* BEGIN -- automatically generated code. */
#define DECL_STATE \
sph_u32 A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
sph_u32 B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
sph_u32 C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do { \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK do { \
M0 = sph_dec32le_aligned(buf + 0); \
M1 = sph_dec32le_aligned(buf + 4); \
M2 = sph_dec32le_aligned(buf + 8); \
M3 = sph_dec32le_aligned(buf + 12); \
M4 = sph_dec32le_aligned(buf + 16); \
M5 = sph_dec32le_aligned(buf + 20); \
M6 = sph_dec32le_aligned(buf + 24); \
M7 = sph_dec32le_aligned(buf + 28); \
M8 = sph_dec32le_aligned(buf + 32); \
M9 = sph_dec32le_aligned(buf + 36); \
MA = sph_dec32le_aligned(buf + 40); \
MB = sph_dec32le_aligned(buf + 44); \
MC = sph_dec32le_aligned(buf + 48); \
MD = sph_dec32le_aligned(buf + 52); \
ME = sph_dec32le_aligned(buf + 56); \
MF = sph_dec32le_aligned(buf + 60); \
} while (0)
#define INPUT_BLOCK_ADD do { \
B0 = T32(B0 + M0); \
B1 = T32(B1 + M1); \
B2 = T32(B2 + M2); \
B3 = T32(B3 + M3); \
B4 = T32(B4 + M4); \
B5 = T32(B5 + M5); \
B6 = T32(B6 + M6); \
B7 = T32(B7 + M7); \
B8 = T32(B8 + M8); \
B9 = T32(B9 + M9); \
BA = T32(BA + MA); \
BB = T32(BB + MB); \
BC = T32(BC + MC); \
BD = T32(BD + MD); \
BE = T32(BE + ME); \
BF = T32(BF + MF); \
} while (0)
#define INPUT_BLOCK_SUB do { \
C0 = T32(C0 - M0); \
C1 = T32(C1 - M1); \
C2 = T32(C2 - M2); \
C3 = T32(C3 - M3); \
C4 = T32(C4 - M4); \
C5 = T32(C5 - M5); \
C6 = T32(C6 - M6); \
C7 = T32(C7 - M7); \
C8 = T32(C8 - M8); \
C9 = T32(C9 - M9); \
CA = T32(CA - MA); \
CB = T32(CB - MB); \
CC = T32(CC - MC); \
CD = T32(CD - MD); \
CE = T32(CE - ME); \
CF = T32(CF - MF); \
} while (0)
#define XOR_W do { \
A00 ^= Wlow; \
A01 ^= Whigh; \
} while (0)
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
#define SWAP_BC do { \
SWAP(B0, C0); \
SWAP(B1, C1); \
SWAP(B2, C2); \
SWAP(B3, C3); \
SWAP(B4, C4); \
SWAP(B5, C5); \
SWAP(B6, C6); \
SWAP(B7, C7); \
SWAP(B8, C8); \
SWAP(B9, C9); \
SWAP(BA, CA); \
SWAP(BB, CB); \
SWAP(BC, CC); \
SWAP(BD, CD); \
SWAP(BE, CE); \
SWAP(BF, CF); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
xa0 = T32((xa0 \
^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
^ xc) * 3U) \
^ xb1 ^ (xb2 & ~xb3) ^ xm; \
xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P do { \
B0 = T32(B0 << 17) | (B0 >> 15); \
B1 = T32(B1 << 17) | (B1 >> 15); \
B2 = T32(B2 << 17) | (B2 >> 15); \
B3 = T32(B3 << 17) | (B3 >> 15); \
B4 = T32(B4 << 17) | (B4 >> 15); \
B5 = T32(B5 << 17) | (B5 >> 15); \
B6 = T32(B6 << 17) | (B6 >> 15); \
B7 = T32(B7 << 17) | (B7 >> 15); \
B8 = T32(B8 << 17) | (B8 >> 15); \
B9 = T32(B9 << 17) | (B9 >> 15); \
BA = T32(BA << 17) | (BA >> 15); \
BB = T32(BB << 17) | (BB >> 15); \
BC = T32(BC << 17) | (BC >> 15); \
BD = T32(BD << 17) | (BD >> 15); \
BE = T32(BE << 17) | (BE >> 15); \
BF = T32(BF << 17) | (BF >> 15); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = T32(A0B + C6); \
A0A = T32(A0A + C5); \
A09 = T32(A09 + C4); \
A08 = T32(A08 + C3); \
A07 = T32(A07 + C2); \
A06 = T32(A06 + C1); \
A05 = T32(A05 + C0); \
A04 = T32(A04 + CF); \
A03 = T32(A03 + CE); \
A02 = T32(A02 + CD); \
A01 = T32(A01 + CC); \
A00 = T32(A00 + CB); \
A0B = T32(A0B + CA); \
A0A = T32(A0A + C9); \
A09 = T32(A09 + C8); \
A08 = T32(A08 + C7); \
A07 = T32(A07 + C6); \
A06 = T32(A06 + C5); \
A05 = T32(A05 + C4); \
A04 = T32(A04 + C3); \
A03 = T32(A03 + C2); \
A02 = T32(A02 + C1); \
A01 = T32(A01 + C0); \
A00 = T32(A00 + CF); \
A0B = T32(A0B + CE); \
A0A = T32(A0A + CD); \
A09 = T32(A09 + CC); \
A08 = T32(A08 + CB); \
A07 = T32(A07 + CA); \
A06 = T32(A06 + C9); \
A05 = T32(A05 + C8); \
A04 = T32(A04 + C7); \
A03 = T32(A03 + C6); \
A02 = T32(A02 + C5); \
A01 = T32(A01 + C4); \
A00 = T32(A00 + C3); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
static const sph_u32 A_init_192[] = {
C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E),
C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465),
C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9)
};
static const sph_u32 B_init_192[] = {
C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824),
C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7),
C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319),
C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C)
};
static const sph_u32 C_init_192[] = {
C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B),
C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640),
C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3),
C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669)
};
static const sph_u32 A_init_224[] = {
C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B),
C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F),
C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061)
};
static const sph_u32 B_init_224[] = {
C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498),
C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5),
C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0),
C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C)
};
static const sph_u32 C_init_224[] = {
C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD),
C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18),
C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2),
C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83)
};
static const sph_u32 A_init_256[] = {
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
};
static const sph_u32 B_init_256[] = {
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
};
static const sph_u32 C_init_256[] = {
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
};
static const sph_u32 A_init_384[] = {
C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83),
C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF),
C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D)
};
static const sph_u32 B_init_384[] = {
C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F),
C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641),
C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8),
C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36)
};
static const sph_u32 C_init_384[] = {
C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399),
C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261),
C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C),
C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70)
};
static const sph_u32 A_init_512[] = {
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
};
static const sph_u32 B_init_512[] = {
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
};
static const sph_u32 C_init_512[] = {
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
/* END -- automatically generated code. */
static void
shabal_init(void *cc, unsigned size)
{
/*
* We have precomputed initial states for all the supported
* output bit lengths.
*/
const sph_u32 *A_init, *B_init, *C_init;
sph_shabal_context *sc;
switch (size) {
case 192:
A_init = A_init_192;
B_init = B_init_192;
C_init = C_init_192;
break;
case 224:
A_init = A_init_224;
B_init = B_init_224;
C_init = C_init_224;
break;
case 256:
A_init = A_init_256;
B_init = B_init_256;
C_init = C_init_256;
break;
case 384:
A_init = A_init_384;
B_init = B_init_384;
C_init = C_init_384;
break;
case 512:
A_init = A_init_512;
B_init = B_init_512;
C_init = C_init_512;
break;
default:
return;
}
sc = cc;
memcpy(sc->A, A_init, sizeof sc->A);
memcpy(sc->B, B_init, sizeof sc->B);
memcpy(sc->C, C_init, sizeof sc->C);
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
static void
shabal_core(void *cc, const unsigned char *data, size_t len)
{
sph_shabal_context *sc;
unsigned char *buf;
size_t ptr;
DECL_STATE
sc = cc;
buf = sc->buf;
ptr = sc->ptr;
/*
* We do not want to copy the state to local variables if the
* amount of data is less than what is needed to complete the
* current block. Note that it is anyway suboptimal to call
* this method many times for small chunks of data.
*/
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data += clen;
len -= clen;
if (ptr == sizeof sc->buf) {
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
static void
shabal_close(void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words)
{
sph_shabal_context *sc;
unsigned char *buf;
size_t ptr;
int i;
unsigned z;
union {
unsigned char tmp_out[64];
sph_u32 dummy;
} u;
size_t out_len;
DECL_STATE
sc = cc;
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr] = ((ub & -z) | z) & 0xFF;
memset(buf + ptr + 1, 0, (sizeof sc->buf) - (ptr + 1));
READ_STATE(sc);
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (i = 0; i < 3; i ++) {
SWAP_BC;
XOR_W;
APPLY_P;
}
/*
* We just use our local variables; no need to go through
* the state structure. In order to share some code, we
* emit the relevant words into a temporary buffer, which
* we finally copy into the destination array.
*/
switch (size_words) {
case 16:
sph_enc32le_aligned(u.tmp_out + 0, B0);
sph_enc32le_aligned(u.tmp_out + 4, B1);
sph_enc32le_aligned(u.tmp_out + 8, B2);
sph_enc32le_aligned(u.tmp_out + 12, B3);
/* fall through */
case 12:
sph_enc32le_aligned(u.tmp_out + 16, B4);
sph_enc32le_aligned(u.tmp_out + 20, B5);
sph_enc32le_aligned(u.tmp_out + 24, B6);
sph_enc32le_aligned(u.tmp_out + 28, B7);
/* fall through */
case 8:
sph_enc32le_aligned(u.tmp_out + 32, B8);
/* fall through */
case 7:
sph_enc32le_aligned(u.tmp_out + 36, B9);
/* fall through */
case 6:
sph_enc32le_aligned(u.tmp_out + 40, BA);
sph_enc32le_aligned(u.tmp_out + 44, BB);
sph_enc32le_aligned(u.tmp_out + 48, BC);
sph_enc32le_aligned(u.tmp_out + 52, BD);
sph_enc32le_aligned(u.tmp_out + 56, BE);
sph_enc32le_aligned(u.tmp_out + 60, BF);
break;
default:
return;
}
out_len = size_words << 2;
memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); /* the digest occupies the last out_len bytes */
// shabal_init(sc, size_words << 5);
}
#if 0
/* see sph_shabal.h */
void
sph_shabal192_init(void *cc)
{
shabal_init(cc, 192);
}
/* see sph_shabal.h */
void
sph_shabal192(void *cc, const void *data, size_t len)
{
shabal_core(cc, data, len);
}
/* see sph_shabal.h */
void
sph_shabal192_close(void *cc, void *dst)
{
shabal_close(cc, 0, 0, dst, 6);
}
/* see sph_shabal.h */
void
sph_shabal192_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_close(cc, ub, n, dst, 6);
}
/* see sph_shabal.h */
void
sph_shabal224_init(void *cc)
{
shabal_init(cc, 224);
}
/* see sph_shabal.h */
void
sph_shabal224(void *cc, const void *data, size_t len)
{
shabal_core(cc, data, len);
}
/* see sph_shabal.h */
void
sph_shabal224_close(void *cc, void *dst)
{
shabal_close(cc, 0, 0, dst, 7);
}
/* see sph_shabal.h */
void
sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_close(cc, ub, n, dst, 7);
}
#endif
/* see sph_shabal.h */
void
sph_shabal256_init(void *cc)
{
shabal_init(cc, 256);
}
/* see sph_shabal.h */
void
sph_shabal256(void *cc, const void *data, size_t len)
{
shabal_core(cc, data, len);
}
/* see sph_shabal.h */
void
sph_shabal256_close(void *cc, void *dst)
{
shabal_close(cc, 0, 0, dst, 8);
}
/* see sph_shabal.h */
void
sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_close(cc, ub, n, dst, 8);
}
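/*
* Editor's note: a guarded-out sketch of incremental hashing with the
* Shabal-256 entry points above; input may be fed in chunks of any
* size before closing. The function name is illustrative only.
*/
#if 0
static void shabal256_usage_example(const unsigned char *p, size_t n)
{
sph_shabal256_context cc;
unsigned char digest[32];
size_t off, clen;
sph_shabal256_init(&cc);
for (off = 0; off < n; off += clen) {
clen = (n - off) < 64 ? (n - off) : 64; /* arbitrary chunking */
sph_shabal256(&cc, p + off, clen);
}
sph_shabal256_close(&cc, digest);
}
#endif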
#if 0
/* see sph_shabal.h */
void
sph_shabal384_init(void *cc)
{
shabal_init(cc, 384);
}
/* see sph_shabal.h */
void
sph_shabal384(void *cc, const void *data, size_t len)
{
shabal_core(cc, data, len);
}
/* see sph_shabal.h */
void
sph_shabal384_close(void *cc, void *dst)
{
shabal_close(cc, 0, 0, dst, 12);
}
/* see sph_shabal.h */
void
sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_close(cc, ub, n, dst, 12);
}
#endif
/* see sph_shabal.h */
void
sph_shabal512_init(void *cc)
{
shabal_init(cc, 512);
}
/* see sph_shabal.h */
void
sph_shabal512(void *cc, const void *data, size_t len)
{
shabal_core(cc, data, len);
}
/* see sph_shabal.h */
void
sph_shabal512_close(void *cc, void *dst)
{
shabal_close(cc, 0, 0, dst, 16);
}
/* see sph_shabal.h */
void
sph_shabal512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_close(cc, ub, n, dst, 16);
}
#ifdef __cplusplus
}
#endif


@ -0,0 +1,344 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this implementation defines Shabal for output
* sizes 192, 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shabal.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHABAL_H__
#define SPH_SHABAL_H__
#include "sph_types.h"
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Output size (in bits) for Shabal-192.
*/
#define SPH_SIZE_shabal192 192
/**
* Output size (in bits) for Shabal-224.
*/
#define SPH_SIZE_shabal224 224
/**
* Output size (in bits) for Shabal-256.
*/
#define SPH_SIZE_shabal256 256
/**
* Output size (in bits) for Shabal-384.
*/
#define SPH_SIZE_shabal384 384
/**
* Output size (in bits) for Shabal-512.
*/
#define SPH_SIZE_shabal512 512
/**
* This structure is a context for Shabal computations: it contains the
* intermediate values and some data from the last entered block. Once
* a Shabal computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Shabal computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
#endif
} sph_shabal_context;
/**
* Type for a Shabal-192 context (identical to the common context).
*/
typedef sph_shabal_context sph_shabal192_context;
/**
* Type for a Shabal-224 context (identical to the common context).
*/
typedef sph_shabal_context sph_shabal224_context;
/**
* Type for a Shabal-256 context (identical to the common context).
*/
typedef sph_shabal_context sph_shabal256_context;
/**
* Type for a Shabal-384 context (identical to the common context).
*/
typedef sph_shabal_context sph_shabal384_context;
/**
* Type for a Shabal-512 context (identical to the common context).
*/
typedef sph_shabal_context sph_shabal512_context;
/**
* Initialize a Shabal-192 context. This process performs no memory allocation.
*
* @param cc the Shabal-192 context (pointer to a
* <code>sph_shabal192_context</code>)
*/
void sph_shabal192_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Shabal-192 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shabal192(void *cc, const void *data, size_t len);
/**
* Terminate the current Shabal-192 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (24 bytes). The context is automatically
* reinitialized.
*
* @param cc the Shabal-192 context
* @param dst the destination buffer
*/
void sph_shabal192_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (24 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Shabal-192 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shabal192_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Shabal-224 context. This process performs no memory allocation.
*
* @param cc the Shabal-224 context (pointer to a
* <code>sph_shabal224_context</code>)
*/
void sph_shabal224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Shabal-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shabal224(void *cc, const void *data, size_t len);
/**
* Terminate the current Shabal-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Shabal-224 context
* @param dst the destination buffer
*/
void sph_shabal224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Shabal-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Shabal-256 context. This process performs no memory allocation.
*
* @param cc the Shabal-256 context (pointer to a
* <code>sph_shabal256_context</code>)
*/
void sph_shabal256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Shabal-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shabal256(void *cc, const void *data, size_t len);
/**
* Terminate the current Shabal-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Shabal-256 context
* @param dst the destination buffer
*/
void sph_shabal256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Shabal-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Shabal-384 context. This process performs no memory allocation.
*
* @param cc the Shabal-384 context (pointer to a
* <code>sph_shabal384_context</code>)
*/
void sph_shabal384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Shabal-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shabal384(void *cc, const void *data, size_t len);
/**
* Terminate the current Shabal-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Shabal-384 context
* @param dst the destination buffer
*/
void sph_shabal384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Shabal-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
/**
* Initialize a Shabal-512 context. This process performs no memory allocation.
*
* @param cc the Shabal-512 context (pointer to a
* <code>sph_shabal512_context</code>)
*/
void sph_shabal512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Shabal-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shabal512(void *cc, const void *data, size_t len);
/**
* Terminate the current Shabal-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Shabal-512 context
* @param dst the destination buffer
*/
void sph_shabal512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Shabal-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shabal512_addbits_and_close(void *cc, unsigned ub, unsigned n,
void *dst);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large.


@ -0,0 +1,314 @@
/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */
/**
* SHAvite-3 interface. This code implements SHAvite-3 with the
* recommended parameters for SHA-3, with outputs of 224, 256, 384 and
* 512 bits. In the following, we call the function "SHAvite" (without
* the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit
* output".
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shavite.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHAVITE_H__
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
/**
* Output size (in bits) for SHAvite-224.
*/
#define SPH_SIZE_shavite224 224
/**
* Output size (in bits) for SHAvite-256.
*/
#define SPH_SIZE_shavite256 256
/**
* Output size (in bits) for SHAvite-384.
*/
#define SPH_SIZE_shavite384 384
/**
* Output size (in bits) for SHAvite-512.
*/
#define SPH_SIZE_shavite512 512
/**
* This structure is a context for SHAvite-224 and SHAvite-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 h[8];
sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
/**
* This structure is a context for SHAvite-224 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite224_context;
/**
* This structure is a context for SHAvite-256 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite256_context;
/**
* This structure is a context for SHAvite-384 and SHAvite-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
sph_u32 h[16];
sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
/**
* This structure is a context for SHAvite-384 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite384_context;
/**
* This structure is a context for SHAvite-512 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite512_context;
/**
* Initialize a SHAvite-224 context. This process performs no memory allocation.
*
* @param cc the SHAvite-224 context (pointer to a
* <code>sph_shavite224_context</code>)
*/
void sph_shavite224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite224(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-224 context
* @param dst the destination buffer
*/
void sph_shavite224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-256 context. This process performs no memory allocation.
*
* @param cc the SHAvite-256 context (pointer to a
* <code>sph_shavite256_context</code>)
*/
void sph_shavite256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite256(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-256 context
* @param dst the destination buffer
*/
void sph_shavite256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-384 context. This process performs no memory allocation.
*
* @param cc the SHAvite-384 context (pointer to a
* <code>sph_shavite384_context</code>)
*/
void sph_shavite384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite384(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-384 context
* @param dst the destination buffer
*/
void sph_shavite384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-512 context. This process performs no memory allocation.
*
* @param cc the SHAvite-512 context (pointer to a
* <code>sph_shavite512_context</code>)
*/
void sph_shavite512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite512(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-512 context
* @param dst the destination buffer
*/
void sph_shavite512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large.


@ -0,0 +1,309 @@
/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */
/**
* SIMD interface. SIMD is a family of functions which differ by
* their output size; this implementation defines SIMD for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_simd.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SIMD_H__
#define SPH_SIMD_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for SIMD-224.
*/
#define SPH_SIZE_simd224 224
/**
* Output size (in bits) for SIMD-256.
*/
#define SPH_SIZE_simd256 256
/**
* Output size (in bits) for SIMD-384.
*/
#define SPH_SIZE_simd384 384
/**
* Output size (in bits) for SIMD-512.
*/
#define SPH_SIZE_simd512 512
/**
* This structure is a context for SIMD computations: it contains the
* intermediate values and some data from the last entered block. Once
* an SIMD computation has been performed, the context can be reused for
* another computation. This specific structure is used for SIMD-224
* and SIMD-256.
*
* The contents of this structure are private. A running SIMD computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 state[16];
sph_u32 count_low, count_high;
#endif
} sph_simd_small_context;
/**
* This structure is a context for SIMD computations: it contains the
* intermediate values and some data from the last entered block. Once
* an SIMD computation has been performed, the context can be reused for
* another computation. This specific structure is used for SIMD-384
* and SIMD-512.
*
* The contents of this structure are private. A running SIMD computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
sph_u32 state[32];
sph_u32 count_low, count_high;
#endif
} sph_simd_big_context;
/**
* Type for a SIMD-224 context (identical to the common "small" context).
*/
typedef sph_simd_small_context sph_simd224_context;
/**
* Type for a SIMD-256 context (identical to the common "small" context).
*/
typedef sph_simd_small_context sph_simd256_context;
/**
* Type for a SIMD-384 context (identical to the common "big" context).
*/
typedef sph_simd_big_context sph_simd384_context;
/**
* Type for a SIMD-512 context (identical to the common "big" context).
*/
typedef sph_simd_big_context sph_simd512_context;
/**
* Initialize an SIMD-224 context. This process performs no memory allocation.
*
* @param cc the SIMD-224 context (pointer to a
* <code>sph_simd224_context</code>)
*/
void sph_simd224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SIMD-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_simd224(void *cc, const void *data, size_t len);
/**
* Terminate the current SIMD-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the SIMD-224 context
* @param dst the destination buffer
*/
void sph_simd224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SIMD-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_simd224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an SIMD-256 context. This process performs no memory allocation.
*
* @param cc the SIMD-256 context (pointer to a
* <code>sph_simd256_context</code>)
*/
void sph_simd256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SIMD-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_simd256(void *cc, const void *data, size_t len);
/**
* Terminate the current SIMD-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the SIMD-256 context
* @param dst the destination buffer
*/
void sph_simd256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SIMD-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_simd256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an SIMD-384 context. This process performs no memory allocation.
*
* @param cc the SIMD-384 context (pointer to a
* <code>sph_simd384_context</code>)
*/
void sph_simd384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SIMD-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_simd384(void *cc, const void *data, size_t len);
/**
* Terminate the current SIMD-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the SIMD-384 context
* @param dst the destination buffer
*/
void sph_simd384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SIMD-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_simd384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an SIMD-512 context. This process performs no memory allocation.
*
* @param cc the SIMD-512 context (pointer to a
* <code>sph_simd512_context</code>)
*/
void sph_simd512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SIMD-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_simd512(void *cc, const void *data, size_t len);
/**
* Terminate the current SIMD-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the SIMD-512 context
* @param dst the destination buffer
*/
void sph_simd512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SIMD-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_simd512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif
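A minimal usage sketch of the streaming API declared above, assuming the matching sph_simd.c implementation from the same sphlib package is linked in; the ub/n values only illustrate the addbits convention and are not project code:

#include <stdio.h>
#include "sph_simd.h"

int main(void)
{
    unsigned char digest[32];
    sph_simd256_context cc;

    /* Byte-oriented use: feed data in any number of calls, then close. */
    sph_simd256_init(&cc);
    sph_simd256(&cc, "abc", 3);
    sph_simd256_close(&cc, digest);            /* 32 bytes out; cc is reinitialized */

    /* Bit-oriented tail: appending the three extra bits 1,0,1 after "abc".
       Per the convention above they occupy bits 7..5 of ub, so ub = 0xA0. */
    sph_simd256(&cc, "abc", 3);
    sph_simd256_addbits_and_close(&cc, 0xA0, 3, digest);

    for (int i = 0; i < 32; ++i)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}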

File diff suppressed because it is too large

View file

@@ -0,0 +1,298 @@
/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
/**
* Skein interface. The Skein specification defines three main
* functions, called Skein-256, Skein-512 and Skein-1024, which can be
* further parameterized with an output length. For the SHA-3
* competition, Skein-512 is used for output sizes of 224, 256, 384 and
* 512 bits; this is what this code implements. Thus, we hereafter call
* Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
* specification defines as Skein-512-224, Skein-512-256, Skein-512-384
* and Skein-512-512, respectively.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_skein.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SKEIN_H__
#define SPH_SKEIN_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
#if SPH_64
/**
* Output size (in bits) for Skein-224.
*/
#define SPH_SIZE_skein224 224
/**
* Output size (in bits) for Skein-256.
*/
#define SPH_SIZE_skein256 256
/**
* Output size (in bits) for Skein-384.
*/
#define SPH_SIZE_skein384 384
/**
* Output size (in bits) for Skein-512.
*/
#define SPH_SIZE_skein512 512
/**
* This structure is a context for Skein computations (with a 384- or
* 512-bit output): it contains the intermediate values and some data
* from the last entered block. Once a Skein computation has been
* performed, the context can be reused for another computation.
*
* The contents of this structure are private. A running Skein computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
sph_u64 bcount;
#endif
} sph_skein_big_context;
/**
* Type for a Skein-224 context (identical to the common "big" context).
*/
typedef sph_skein_big_context sph_skein224_context;
/**
* Type for a Skein-256 context (identical to the common "big" context).
*/
typedef sph_skein_big_context sph_skein256_context;
/**
* Type for a Skein-384 context (identical to the common "big" context).
*/
typedef sph_skein_big_context sph_skein384_context;
/**
* Type for a Skein-512 context (identical to the common "big" context).
*/
typedef sph_skein_big_context sph_skein512_context;
/**
* Initialize a Skein-224 context. This process performs no memory allocation.
*
* @param cc the Skein-224 context (pointer to a
* <code>sph_skein224_context</code>)
*/
void sph_skein224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Skein-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_skein224(void *cc, const void *data, size_t len);
/**
* Terminate the current Skein-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the Skein-224 context
* @param dst the destination buffer
*/
void sph_skein224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Skein-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_skein224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Skein-256 context. This process performs no memory allocation.
*
* @param cc the Skein-256 context (pointer to a
* <code>sph_skein256_context</code>)
*/
void sph_skein256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Skein-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_skein256(void *cc, const void *data, size_t len);
/**
* Terminate the current Skein-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the Skein-256 context
* @param dst the destination buffer
*/
void sph_skein256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Skein-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_skein256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Skein-384 context. This process performs no memory allocation.
*
* @param cc the Skein-384 context (pointer to a
* <code>sph_skein384_context</code>)
*/
void sph_skein384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Skein-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_skein384(void *cc, const void *data, size_t len);
/**
* Terminate the current Skein-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the Skein-384 context
* @param dst the destination buffer
*/
void sph_skein384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Skein-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_skein384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a Skein-512 context. This process performs no memory allocation.
*
* @param cc the Skein-512 context (pointer to a
* <code>sph_skein512_context</code>)
*/
void sph_skein512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the Skein-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_skein512(void *cc, const void *data, size_t len);
/**
* Terminate the current Skein-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Skein-512 context
* @param dst the destination buffer
*/
void sph_skein512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Skein-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_skein512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
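The "cloned by copying the context" remark above is worth a concrete sketch: one running Skein-512 computation can be forked so a shared prefix is hashed only once. The helper below is hypothetical, assuming the matching sph_skein.c is linked in:

#include <string.h>
#include "sph_skein.h"

void skein512_two_suffixes(const void *prefix, size_t prefix_len,
                           unsigned char out_a[64], unsigned char out_b[64])
{
    sph_skein512_context base, fork;

    sph_skein512_init(&base);
    sph_skein512(&base, prefix, prefix_len);   /* shared prefix, hashed once */

    memcpy(&fork, &base, sizeof base);         /* clone the running state */

    sph_skein512(&base, "A", 1);               /* branch 1: prefix || "A" */
    sph_skein512_close(&base, out_a);

    sph_skein512(&fork, "B", 1);               /* branch 2: prefix || "B" */
    sph_skein512_close(&fork, out_b);
}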

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -0,0 +1,224 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL comes in three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_WHIRLPOOL_H__
#define SPH_WHIRLPOOL_H__
#include "sph_types.h"
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#if SPH_64
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
/**
* This structure is a context for WHIRLPOOL computations: it contains the
* intermediate values and some data from the last entered block. Once
* a WHIRLPOOL computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running WHIRLPOOL computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u64 state[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_whirlpool_context;
/**
* Initialize a WHIRLPOOL context. This process performs no memory allocation.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool_context</code>)
*/
void sph_whirlpool_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* plain WHIRLPOOL algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL context
* @param dst the destination buffer
*/
void sph_whirlpool_close(void *cc, void *dst);
#define sph_whirlpool512_full(cc, dst, data, len) \
do { \
sph_whirlpool_init(cc); \
sph_whirlpool(cc, data, len); \
sph_whirlpool_close(cc, dst); \
} while (0)
/**
* WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool0_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-0 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool0_context</code>)
*/
void sph_whirlpool0_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool0_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-0 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool0(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-0 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-0 context
* @param dst the destination buffer
*/
void sph_whirlpool0_close(void *cc, void *dst);
/**
* WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool1_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-1 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool1_context</code>)
*/
void sph_whirlpool1_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool1_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-1 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool1(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-1 context
* @param dst the destination buffer
*/
void sph_whirlpool1_close(void *cc, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
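The sph_whirlpool512_full macro defined above simply chains init, update and close; a one-shot wrapper sketch (the wrapper name is hypothetical):

#include "sph_whirlpool.h"

void whirlpool_oneshot(const void *msg, size_t len, unsigned char digest[64])
{
    sph_whirlpool_context cc;
    sph_whirlpool512_full(&cc, digest, msg, len);  /* expands to init + update + close */
}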

View file

@@ -166,9 +166,9 @@ static inline uint32_t popcount_soft(uint64_t x)
constexpr uint64_t h01 = 0x0101010101010101ull;
x -= (x >> 1) & m1; //put count of each 2 bits into those 2 bits
x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits
x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits
return (x * h01) >> 56; //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits
x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits
return (x * h01) >> 56; //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
}
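The hunk above only re-aligns comments, but the SWAR popcount it touches merits a note: after the m4 step every byte of x holds that byte's own bit count, and multiplying by h01 = 0x0101...01 sums all eight byte counts into the top byte, which the shift by 56 extracts. A standalone self-check against a naive loop (sketch, not project code):

#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t popcount_naive(uint64_t x)
{
    uint32_t n = 0;
    for (; x; x >>= 1) n += static_cast<uint32_t>(x & 1);
    return n;
}

static uint32_t popcount_swar(uint64_t x)
{
    constexpr uint64_t m1  = 0x5555555555555555ull;
    constexpr uint64_t m2  = 0x3333333333333333ull;
    constexpr uint64_t m4  = 0x0f0f0f0f0f0f0f0full;
    constexpr uint64_t h01 = 0x0101010101010101ull;
    x -= (x >> 1) & m1;              // 2-bit fields hold counts 0..2
    x = (x & m2) + ((x >> 2) & m2);  // 4-bit fields hold counts 0..4
    x = (x + (x >> 4)) & m4;         // each byte holds its own count 0..8
    return (x * h01) >> 56;          // multiply accumulates all bytes into the top one
}

int main()
{
    for (uint64_t x : { 0ull, 1ull, 0xffull, 0xdeadbeefcafef00dull, ~0ull })
        assert(popcount_swar(x) == popcount_naive(x));
    return 0;
}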

View file

@@ -382,7 +382,7 @@ void SelectSoftAESImpl(size_t threadsCount)
double fast_speed = 0.0;
for (size_t run = 0; run < 3; ++run) {
for (size_t i = 0; i < impl.size(); ++i) {
const uint64_t t1 = xmrig::Chrono::highResolutionMSecs();
const double t1 = xmrig::Chrono::highResolutionMSecs();
std::vector<uint32_t> count(threadsCount, 0);
std::vector<std::thread> threads;
for (size_t t = 0; t < threadsCount; ++t) {
@@ -401,7 +401,7 @@ void SelectSoftAESImpl(size_t threadsCount)
threads[t].join();
total += count[t];
}
const uint64_t t2 = xmrig::Chrono::highResolutionMSecs();
const double t2 = xmrig::Chrono::highResolutionMSecs();
const double speed = total * 1e3 / (t2 - t1);
if (speed > fast_speed) {
fast_idx = i;
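For context on the uint64_t-to-double change above: Chrono::highResolutionMSecs() appears to carry fractional milliseconds, and truncating the timestamps to integers distorts speed = total * 1e3 / (t2 - t1) on short runs. An illustrative standalone timing sketch (not XMRig code):

#include <chrono>
#include <cstdio>

static double now_ms()
{
    using namespace std::chrono;
    return duration<double, std::milli>(steady_clock::now().time_since_epoch()).count();
}

int main()
{
    const double t1 = now_ms();
    volatile unsigned long long sink = 0;
    for (unsigned long long i = 0; i < 10000000ULL; ++i) sink = sink + i;  // tiny workload
    const double t2 = now_ms();
    const double dt = (t2 > t1) ? (t2 - t1) : 1e-9;        // guard a sub-resolution run
    std::printf("ops/s = %.0f\n", 10000000.0 * 1e3 / dt);  // same shape as the speed formula above
    return 0;
}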

View file

@@ -240,10 +240,17 @@ namespace randomx {
return x;
}
void cleanup() {
for (unsigned i = 0; i < RegistersCount; ++i) {
registerUsage[i] = -1;
}
nreg = nullptr;
}
private:
static const int_reg_t zero;
int registerUsage[RegistersCount];
NativeRegisterFile* nreg;
int registerUsage[RegistersCount] = {};
NativeRegisterFile* nreg = nullptr;
static void* getScratchpadAddress(InstructionByteCode& ibc, uint8_t* scratchpad) {
uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask;
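The AND in getScratchpadAddress above is the usual wrap-into-a-power-of-two idiom: with a mask of the form (size - 1) cleared down to the access alignment, any 32-bit value becomes a valid, aligned scratchpad offset. A hedged illustration with made-up constants (the real masks depend on the configured scratchpad level):

#include <cstdint>

constexpr uint32_t SCRATCHPAD_SIZE = 2 * 1024 * 1024;      // 2 MiB, a power of two
constexpr uint32_t MEM_MASK = (SCRATCHPAD_SIZE - 1) & ~7u; // wrap, keep 8-byte alignment

inline void* scratchpad_address(uint32_t isrc, uint32_t imm, uint8_t* scratchpad)
{
    const uint32_t addr = (isrc + imm) & MEM_MASK;  // wrap and align in a single AND
    return scratchpad + addr;
}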

View file

@@ -28,13 +28,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
void randomx_set_huge_pages_jit(bool)
{
}
void randomx_set_optimized_dataset_init(int)
{
}

View file

@@ -114,42 +114,42 @@ namespace randomx {
#define codeLoopBegin ADDR(randomx_program_loop_begin)
#define codeLoopLoad ADDR(randomx_program_loop_load)
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
#define codeProgamStart ADDR(randomx_program_start)
#define codeProgramStart ADDR(randomx_program_start)
#define codeReadDataset ADDR(randomx_program_read_dataset)
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeDatasetInitAVX2Prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2LoopEnd ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2Epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2SshLoad ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2SshPrefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
#define codeProgramEnd ADDR(randomx_program_end)
#define codeShhLoad ADDR(randomx_sshash_load)
#define codeShhPrefetch ADDR(randomx_sshash_prefetch)
#define codeShhEnd ADDR(randomx_sshash_end)
#define codeShhInit ADDR(randomx_sshash_init)
#define codeSshLoad ADDR(randomx_sshash_load)
#define codeSshPrefetch ADDR(randomx_sshash_prefetch)
#define codeSshEnd ADDR(randomx_sshash_end)
#define codeSshInit ADDR(randomx_sshash_init)
#define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgamStart - codeLoopLoadXOP)
#define loopLoadXOPSize (codeProgramStart - codeLoopLoadXOP)
#define readDatasetSize (codeReadDatasetLightSshInit - codeReadDataset)
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
#define epilogueSize (codeShhLoad - codeEpilogue)
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
#define codeSshInitSize (codeProgramEnd - codeShhInit)
#define datasetInitSize (codeDatasetInitAVX2Prologue - codeDatasetInit)
#define datasetInitAVX2PrologueSize (codeDatasetInitAVX2LoopEnd - codeDatasetInitAVX2Prologue)
#define datasetInitAVX2LoopEndSize (codeDatasetInitAVX2Epilogue - codeDatasetInitAVX2LoopEnd)
#define datasetInitAVX2EpilogueSize (codeDatasetInitAVX2SshLoad - codeDatasetInitAVX2Epilogue)
#define datasetInitAVX2SshLoadSize (codeDatasetInitAVX2SshPrefetch - codeDatasetInitAVX2SshLoad)
#define datasetInitAVX2SshPrefetchSize (codeEpilogue - codeDatasetInitAVX2SshPrefetch)
#define epilogueSize (codeSshLoad - codeEpilogue)
#define codeSshLoadSize (codeSshPrefetch - codeSshLoad)
#define codeSshPrefetchSize (codeSshEnd - codeSshPrefetch)
#define codeSshInitSize (codeProgramEnd - codeSshInit)
#define epilogueOffset ((CodeSize - epilogueSize) & ~63)
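The epilogueOffset definition above uses the align-down idiom: clearing the low six bits snaps a value to a 64-byte (cache line) boundary. A compile-time check of the pattern (standalone sketch):

#include <cstddef>

constexpr std::size_t align_down_64(std::size_t x) { return x & ~static_cast<std::size_t>(63); }

static_assert(align_down_64(4096) == 4096, "already aligned");
static_assert(align_down_64(4100) == 4096, "rounds down to the cache line");
static_assert(align_down_64(63) == 0, "anything below one line rounds to zero");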
@@ -341,7 +341,7 @@ namespace randomx {
uint8_t* p = code;
if (initDatasetAVX2) {
codePos = 0;
emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
emit(codeDatasetInitAVX2Prologue, datasetInitAVX2PrologueSize, code, codePos);
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
@@ -350,29 +350,29 @@ namespace randomx {
generateSuperscalarCode<true>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
emit(codeSshLoad, codeSshLoadSize, code, codePos);
emit(codeDatasetInitAVX2SshLoad, datasetInitAVX2SshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
codePos += 3;
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
emit(RandomX_CurrentConfig.codeSshPrefetchTweaked, codeSshPrefetchSize, code, codePos);
uint8_t* p = code + codePos;
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
emit(codeDatasetInitAVX2SshPrefetch, datasetInitAVX2SshPrefetchSize, code, codePos);
p[3] += prog.getAddressRegister() << 3;
}
}
emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
emit(codeDatasetInitAVX2LoopEnd, datasetInitAVX2LoopEndSize, code, codePos);
// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
constexpr int32_t prologue_size = 320;
*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
emit(codeDatasetInitAVX2Epilogue, datasetInitAVX2EpilogueSize, code, codePos);
return;
}
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
memcpy(code + superScalarHashOffset, codeSshInit, codeSshInitSize);
codePos = superScalarHashOffset + codeSshInitSize;
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
@@ -381,11 +381,11 @@ namespace randomx {
generateSuperscalarCode<false>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
emit(codeSshLoad, codeSshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
codePos += 3;
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
emit(RandomX_CurrentConfig.codeSshPrefetchTweaked, codeSshPrefetchSize, code, codePos);
}
}
emitByte(0xc3, code, codePos);
@@ -411,7 +411,7 @@ namespace randomx {
}
# ifdef XMRIG_FIX_RYZEN
xmrig::RxFix::setMainLoopBounds(mainLoopBounds);
xmrig::RxFix::setMainLoopBounds(mainLoopBounds);
# endif
imul_rcp_storage = code + (ADDR(randomx_program_imul_rcp_store) - codePrologue) + 2;

View file

@@ -176,7 +176,7 @@ init_block_loop:
prefetchw byte ptr [rsi]
mov rbx, rbp
.byte 232 ;# 0xE8 = call
;# .set CALL_LOC,
;# .set CALL_LOC,
.int 32768 - (call_offset - DECL(randomx_dataset_init))
call_offset:
mov qword ptr [rsi+0], r8

View file

@@ -172,7 +172,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
{
const uint8_t* a = addr(randomx_sshash_prefetch);
const uint8_t* b = addr(randomx_sshash_end);
memcpy(codeShhPrefetchTweaked, a, b - a);
memcpy(codeSshPrefetchTweaked, a, b - a);
}
if (xmrig::Cpu::info()->hasBMI2()) {
const uint8_t* a = addr(randomx_prefetch_scratchpad_bmi2);
@@ -214,7 +214,7 @@ void RandomX_ConfigurationBase::Apply()
ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
#if defined(XMRIG_FEATURE_ASM) && (defined(_M_X64) || defined(__x86_64__))
*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
*(uint32_t*)(codeSshPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
// Not needed right now because all variants use default dataset base size
//const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
//*(uint32_t*)(codeReadDatasetTweaked + 9) = DatasetBaseMask;
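Worked through for a 2 MiB L3 scratchpad (the usual RandomX default; the size is assumed here), the mask computed in Apply() above comes out to the scratchpad size minus one 64-byte line, so ANDing an address with it yields an aligned offset that stays inside the scratchpad:

#include <cstdint>

constexpr uint64_t ScratchpadL3_Size = 2097152;  // 2 MiB, assumed default
constexpr uint64_t Mask64 = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;

static_assert(Mask64 == 2097088, "2 MiB minus one 64-byte cache line");
static_assert((Mask64 & 63) == 0, "any (value & Mask64) is 64-byte aligned");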
@@ -295,7 +295,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(IMUL_R, ISUB_M);
INST_HANDLE(IMUL_M, IMUL_R);
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_ASM) && (defined(_M_X64) || defined(__x86_64__))
if (hasBMI2) {
INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M);
INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R);
@@ -337,7 +337,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(CBRANCH, FSQRT_R);
#endif
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_ASM) && (defined(_M_X64) || defined(__x86_64__))
if (hasBMI2) {
INST_HANDLE2(CFROUND, CFROUND_BMI2, CBRANCH);
}
@@ -410,6 +410,7 @@ extern "C" {
}
void randomx_release_cache(randomx_cache* cache) {
delete cache->jit;
delete cache;
}
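The added delete cache; means the release function now frees the handle itself, not just its JIT member. On the C++ side a caller can make that automatic with a custom deleter; a sketch assuming the standard randomx allocation API:

#include <memory>

#include "crypto/randomx/randomx.h"

using CachePtr = std::unique_ptr<randomx_cache, decltype(&randomx_release_cache)>;

// CachePtr cache(randomx_alloc_cache(RANDOMX_FLAG_DEFAULT), &randomx_release_cache);
// ...use cache.get()...; randomx_release_cache() then runs automatically on scope exit.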

View file

@@ -124,7 +124,7 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8];
uint8_t codeShhPrefetchTweaked[20];
uint8_t codeSshPrefetchTweaked[20];
uint8_t codePrefetchScratchpadTweaked[28];
uint32_t codePrefetchScratchpadTweakedSize;

View file

@@ -231,7 +231,7 @@ namespace randomx {
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP");
//these are some of the ways to split a 16-byte window into 3 or 4 x86 instructions.
@@ -494,7 +494,7 @@ namespace randomx {
// * value must be ready at the required cycle
// * cannot be the same as the source register unless the instruction allows it
// - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
// * register cannot be multiplied twice in a row unless allowChainedMul is true
// * register cannot be multiplied twice in a row unless allowChainedMul is true
// - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
// - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
// * either the last instruction applied to the register or its source must be different than this instruction
@@ -619,7 +619,7 @@ namespace randomx {
if (commit)
if (trace) std::cout << "; (eliminated)" << std::endl;
return cycle;
}
}
else if (mop.isSimple()) {
//this macro-op has only one uOP
return scheduleUop<commit>(mop.getUop1(), portBusy, cycle);
@@ -676,7 +676,7 @@ namespace randomx {
if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl;
int bufferIndex = 0;
//fill all instruction slots in the current decode buffer
while (bufferIndex < decodeBuffer->getSize()) {
int topCycle = cycle;
@@ -831,7 +831,7 @@ namespace randomx {
prog.decodeCycles = decodeCycle;
prog.ipc = ipc;
prog.mulCount = mulCount;
/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;

View file

@@ -104,6 +104,8 @@ namespace randomx {
for (unsigned i = 0; i < RegisterCountFlt; ++i)
rx_store_vec_f128(&reg.e[i].lo, nreg.e[i]);
cleanup();
}
template<int softAes>

View file

@@ -91,6 +91,9 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
if ((f != Algorithm::RANDOM_X)
# ifdef XMRIG_ALGO_CN_HEAVY
&& (f != Algorithm::CN_HEAVY)
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
&& (f != Algorithm::GHOSTRIDER)
# endif
) {
# ifdef XMRIG_FEATURE_MSR
@@ -112,6 +115,12 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
}
# endif
# ifdef XMRIG_ALGO_GHOSTRIDER
if (f == Algorithm::GHOSTRIDER) {
return true;
}
# endif
randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode());
randomx_set_huge_pages_jit(cpu.isHugePagesJit());
randomx_set_optimized_dataset_init(config.initDatasetAVX2());

View file

@@ -1,7 +1,7 @@
/* XMRig
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright (c) 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,7 +22,6 @@
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include "base/kernel/OS.h"
#include "base/kernel/Platform.h"
#include "crypto/common/VirtualMemory.h"
#include "crypto/randomx/randomx.h"
#include "crypto/rx/RxAlgo.h"
@@ -37,7 +36,7 @@ namespace xmrig {
static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache, uint32_t startItem, uint32_t itemCount, int priority)
{
Platform::setThreadPriority(priority);
OS::setThreadPriority(priority);
if (Cpu::info()->hasAVX2() && (itemCount % 5)) {
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
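A note on the itemCount % 5 split above: the AVX2 dataset-init path appears to produce items five at a time, so the wrapper rounds the bulk range down to a multiple of five; the remainder is presumably initialized by a second call not shown in this hunk. A sketch of that split with placeholder names:

#include <cstdint>

void split_for_avx2(uint32_t startItem, uint32_t itemCount,
                    void (*init)(uint32_t start, uint32_t count))  // stands in for randomx_init_dataset
{
    const uint32_t tail = itemCount % 5;           // items the 5-wide kernel cannot cover
    if (tail) {
        init(startItem, itemCount - tail);         // bulk part: multiple of 5, AVX2 path
        init(startItem + itemCount - tail, tail);  // scalar tail, item by item
    }
    else {
        init(startItem, itemCount);
    }
}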

View file

@@ -1,7 +1,7 @@
/* XMRig
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright (c) 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by

View file

@@ -1,7 +1,7 @@
/* XMRig
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright (c) 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
#include "backend/cpu/platform/HwlocCpuInfo.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include "base/kernel/Platform.h"
#include "base/kernel/OS.h"
#include "base/tools/Chrono.h"
#include "crypto/rx/RxAlgo.h"
#include "crypto/rx/RxCache.h"
@@ -52,7 +52,7 @@ static bool bindToNUMANode(uint32_t nodeId)
}
if (cpu->membind(node->nodeset)) {
Platform::setThreadAffinity(static_cast<uint64_t>(hwloc_bitmap_first(node->cpuset)));
OS::setThreadAffinity(static_cast<uint64_t>(hwloc_bitmap_first(node->cpuset)));
return true;
}

View file

@@ -1,7 +1,7 @@
/* XMRig
* Copyright (c) 2018-2019 tevador <tevador@gmail.com>
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
* Copyright (c) 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright (c) 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by

Some files were not shown because too many files have changed in this diff