REDACTED-rig/src/crypto/ghostrider/sph_keccak.c

/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
/*
 * Keccak implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#include "sph_keccak.h"
#include <stddef.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

// Taken from keccak-gate.c
// File-scope, non-static and mutable, so it has external linkage and other
// translation units may read or modify it.
// NOTE(review): not referenced in the visible part of this file; presumably
// a padding/"extra bits" selector consumed by the finalization code further
// down -- confirm before renaming it or narrowing its linkage.
int hard_coded_eb = 1;

/*
 * Parameters:
 *
 *  SPH_KECCAK_64          use a 64-bit type
 *  SPH_KECCAK_UNROLL      number of loops to unroll (0/undef for full unroll)
 *  SPH_KECCAK_INTERLEAVE  use bit-interleaving (32-bit type only)
 *  SPH_KECCAK_NOCOPY      do not copy the state into local variables
 *
 * If there is no usable 64-bit type, the code automatically switches
 * back to the 32-bit implementation.
 *
 * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
 * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
 * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
 * 8 kB L1 code cache), seem to show that the following are optimal:
 *
 * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
 * do not copy the state; unrolling 2, 6 or all rounds also provides
 * near-optimal performance.
 * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
 * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
 * also provides near-optimal performance.
 * -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
 * copy the state. Unrolling 4 or 6 rounds is near-optimal.
 * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
 * copy the state.
 * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
 * the state. Unrolling only 1 round is also near-optimal.
 *
 * Also, interleaving does not always yield actual improvements when
 * using a 32-bit implementation; in particular when the architecture
 * does not offer a native rotation opcode (interleaving replaces one
 * 64-bit rotation with two 32-bit rotations, which is a gain only if
 * there is a native 32-bit rotation opcode and not a native 64-bit
 * rotation opcode; also, interleaving implies a small overhead when
 * processing input words).
 *
 * To sum up:
 * -- when possible, use the 64-bit code
 * -- exception: on 32-bit x86, use 32-bit code
 * -- when using 32-bit code, use interleaving
 * -- copy the state, except on x86
 * -- unroll 8 rounds on "big" machines, 2 rounds on "small" machines
 */

/*
 * If the generic SPH_SMALL_FOOTPRINT switch is non-zero, enable the
 * Keccak-specific small-footprint switch unless it was already defined.
 */
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK
#define SPH_SMALL_FOOTPRINT_KECCAK 1
#endif

/*
 * By default, we select the 64-bit implementation if a 64-bit type
 * is available (SPH_64 non-zero), unless a 32-bit x86 is detected.
 * An explicit definition of SPH_KECCAK_64 (even to 0) is left untouched.
 */
#if !defined SPH_KECCAK_64 && SPH_64 &&                                        \
    !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC)
#define SPH_KECCAK_64 1
#endif

/*
 * If using a 32-bit implementation, we prefer to interleave.
 * (An undefined SPH_KECCAK_64 evaluates to 0 inside #if, so this also
 * covers the case where the 64-bit default above was not selected.)
 */
#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE
#define SPH_KECCAK_INTERLEAVE 1
#endif

/*
 * Unroll 8 rounds on big systems, 2 rounds on small systems.
 * Only applied when the caller did not choose a value.
 */
#ifndef SPH_KECCAK_UNROLL
#if SPH_SMALL_FOOTPRINT_KECCAK
#define SPH_KECCAK_UNROLL 2
#else
#define SPH_KECCAK_UNROLL 8
#endif
#endif

/*
 * We do not want to copy the state to local variables on x86 (32-bit
 * and 64-bit alike). After this block SPH_KECCAK_NOCOPY is always
 * defined, as either 0 or 1 unless the caller supplied another value.
 */
#ifndef SPH_KECCAK_NOCOPY
#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC
#define SPH_KECCAK_NOCOPY 1
#else
#define SPH_KECCAK_NOCOPY 0
#endif
#endif

/*
 * MSVC only: disable warning C4146 from here to the end of the
 * translation unit (there is no matching push/pop in this file section).
 */
#ifdef _MSC_VER
#pragma warning(disable : 4146)
#endif

#if SPH_KECCAK_64

/*
 * Constant table for the 64-bit code path: 24 64-bit values.
 * NOTE(review): presumably indexed by round number and XORed into lane
 * a00 by the round code later in this file -- the use site is not
 * visible in this part of the file.
 */
static const sph_u64 RC[] = {
    SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
    SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
    SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
    SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
    SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
    SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
    SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
    SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
    SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
    SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
    SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
    SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)};

#if SPH_KECCAK_NOCOPY

/*
 * No-copy mode: each lane name is an alias for one word of the context
 * state, so any code using these names works on kc->u.wide in place.
 * Lane aXY is the word at index 5 * Y + X. A pointer named `kc` must be
 * in scope wherever these names are expanded.
 */
#define a00 (kc->u.wide[5 * 0 + 0])
#define a10 (kc->u.wide[5 * 0 + 1])
#define a20 (kc->u.wide[5 * 0 + 2])
#define a30 (kc->u.wide[5 * 0 + 3])
#define a40 (kc->u.wide[5 * 0 + 4])
#define a01 (kc->u.wide[5 * 1 + 0])
#define a11 (kc->u.wide[5 * 1 + 1])
#define a21 (kc->u.wide[5 * 1 + 2])
#define a31 (kc->u.wide[5 * 1 + 3])
#define a41 (kc->u.wide[5 * 1 + 4])
#define a02 (kc->u.wide[5 * 2 + 0])
#define a12 (kc->u.wide[5 * 2 + 1])
#define a22 (kc->u.wide[5 * 2 + 2])
#define a32 (kc->u.wide[5 * 2 + 3])
#define a42 (kc->u.wide[5 * 2 + 4])
#define a03 (kc->u.wide[5 * 3 + 0])
#define a13 (kc->u.wide[5 * 3 + 1])
#define a23 (kc->u.wide[5 * 3 + 2])
#define a33 (kc->u.wide[5 * 3 + 3])
#define a43 (kc->u.wide[5 * 3 + 4])
#define a04 (kc->u.wide[5 * 4 + 0])
#define a14 (kc->u.wide[5 * 4 + 1])
#define a24 (kc->u.wide[5 * 4 + 2])
#define a34 (kc->u.wide[5 * 4 + 3])
#define a44 (kc->u.wide[5 * 4 + 4])

/* Nothing to declare, load or store: the state is accessed in place. */
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)

/*
 * No-copy mode: XOR the first `size` bytes of `buf` into the state, one
 * 64-bit word (decoded with sph_dec64le_aligned) per 8 bytes. The word at
 * byte offset `o` goes into kc->u.wide[o / 8]. `kc` and `buf` are taken
 * from the expansion site; `size` is re-evaluated on every iteration.
 */
#define INPUT_BUF(size)                                                        \
  do {                                                                         \
    size_t kec_off_ = 0;                                                       \
    while (kec_off_ < (size)) {                                                \
      kc->u.wide[kec_off_ / 8] ^= sph_dec64le_aligned(buf + kec_off_);         \
      kec_off_ += 8;                                                           \
    }                                                                          \
  } while (0)

/* Fixed-size variants, one per supported block length in bytes. */
#define INPUT_BUF72 INPUT_BUF(72)
#define INPUT_BUF104 INPUT_BUF(104)
#define INPUT_BUF136 INPUT_BUF(136)
#define INPUT_BUF144 INPUT_BUF(144)

#else

/*
 * Copy mode: declare the 25 lanes as local sph_u64 variables. They are
 * listed here in storage order (lane aXY corresponds to u.wide[5 * Y + X]).
 * The expansion already ends with ';', so the macro stands alone as a
 * declaration.
 */
#define DECL_STATE                                                             \
  sph_u64 a00, a10, a20, a30, a40;                                             \
  sph_u64 a01, a11, a21, a31, a41;                                             \
  sph_u64 a02, a12, a22, a32, a42;                                             \
  sph_u64 a03, a13, a23, a33, a43;                                             \
  sph_u64 a04, a14, a24, a34, a44;

/*
 * Copy mode: load the 25 lanes from the context pointed to by `state`
 * into the local lane variables. Lane aXY is read from u.wide[5 * Y + X].
 * `state` is expanded once per lane, so it should be free of side effects.
 */
#define READ_STATE(state)                                                      \
  do {                                                                         \
    a00 = (state)->u.wide[5 * 0 + 0];                                          \
    a10 = (state)->u.wide[5 * 0 + 1];                                          \
    a20 = (state)->u.wide[5 * 0 + 2];                                          \
    a30 = (state)->u.wide[5 * 0 + 3];                                          \
    a40 = (state)->u.wide[5 * 0 + 4];                                          \
    a01 = (state)->u.wide[5 * 1 + 0];                                          \
    a11 = (state)->u.wide[5 * 1 + 1];                                          \
    a21 = (state)->u.wide[5 * 1 + 2];                                          \
    a31 = (state)->u.wide[5 * 1 + 3];                                          \
    a41 = (state)->u.wide[5 * 1 + 4];                                          \
    a02 = (state)->u.wide[5 * 2 + 0];                                          \
    a12 = (state)->u.wide[5 * 2 + 1];                                          \
    a22 = (state)->u.wide[5 * 2 + 2];                                          \
    a32 = (state)->u.wide[5 * 2 + 3];                                          \
    a42 = (state)->u.wide[5 * 2 + 4];                                          \
    a03 = (state)->u.wide[5 * 3 + 0];                                          \
    a13 = (state)->u.wide[5 * 3 + 1];                                          \
    a23 = (state)->u.wide[5 * 3 + 2];                                          \
    a33 = (state)->u.wide[5 * 3 + 3];                                          \
    a43 = (state)->u.wide[5 * 3 + 4];                                          \
    a04 = (state)->u.wide[5 * 4 + 0];                                          \
    a14 = (state)->u.wide[5 * 4 + 1];                                          \
    a24 = (state)->u.wide[5 * 4 + 2];                                          \
    a34 = (state)->u.wide[5 * 4 + 3];                                          \
    a44 = (state)->u.wide[5 * 4 + 4];                                          \
  } while (0)

/*
 * Copy mode: store the 25 local lane variables back into the context
 * pointed to by `state`. Lane aXY is written to u.wide[5 * Y + X].
 * `state` is expanded once per lane, so it should be free of side effects.
 */
#define WRITE_STATE(state)                                                     \
  do {                                                                         \
    (state)->u.wide[5 * 0 + 0] = a00;                                          \
    (state)->u.wide[5 * 0 + 1] = a10;                                          \
    (state)->u.wide[5 * 0 + 2] = a20;                                          \
    (state)->u.wide[5 * 0 + 3] = a30;                                          \
    (state)->u.wide[5 * 0 + 4] = a40;                                          \
    (state)->u.wide[5 * 1 + 0] = a01;                                          \
    (state)->u.wide[5 * 1 + 1] = a11;                                          \
    (state)->u.wide[5 * 1 + 2] = a21;                                          \
    (state)->u.wide[5 * 1 + 3] = a31;                                          \
    (state)->u.wide[5 * 1 + 4] = a41;                                          \
    (state)->u.wide[5 * 2 + 0] = a02;                                          \
    (state)->u.wide[5 * 2 + 1] = a12;                                          \
    (state)->u.wide[5 * 2 + 2] = a22;                                          \
    (state)->u.wide[5 * 2 + 3] = a32;                                          \
    (state)->u.wide[5 * 2 + 4] = a42;                                          \
    (state)->u.wide[5 * 3 + 0] = a03;                                          \
    (state)->u.wide[5 * 3 + 1] = a13;                                          \
    (state)->u.wide[5 * 3 + 2] = a23;                                          \
    (state)->u.wide[5 * 3 + 3] = a33;                                          \
    (state)->u.wide[5 * 3 + 4] = a43;                                          \
    (state)->u.wide[5 * 4 + 0] = a04;                                          \
    (state)->u.wide[5 * 4 + 1] = a14;                                          \
    (state)->u.wide[5 * 4 + 2] = a24;                                          \
    (state)->u.wide[5 * 4 + 3] = a34;                                          \
    (state)->u.wide[5 * 4 + 4] = a44;                                          \
  } while (0)

/*
 * Copy mode, 144-byte block: XOR 18 consecutive 64-bit words of `buf`
 * (decoded with sph_dec64le_aligned) into the first 18 lanes in storage
 * order, a00 through a23. Offsets are written as 8 * word-index.
 * `buf` and the lane variables come from the expansion site.
 */
#define INPUT_BUF144                                                           \
  do {                                                                         \
    a00 ^= sph_dec64le_aligned(buf + 8 * 0);                                   \
    a10 ^= sph_dec64le_aligned(buf + 8 * 1);                                   \
    a20 ^= sph_dec64le_aligned(buf + 8 * 2);                                   \
    a30 ^= sph_dec64le_aligned(buf + 8 * 3);                                   \
    a40 ^= sph_dec64le_aligned(buf + 8 * 4);                                   \
    a01 ^= sph_dec64le_aligned(buf + 8 * 5);                                   \
    a11 ^= sph_dec64le_aligned(buf + 8 * 6);                                   \
    a21 ^= sph_dec64le_aligned(buf + 8 * 7);                                   \
    a31 ^= sph_dec64le_aligned(buf + 8 * 8);                                   \
    a41 ^= sph_dec64le_aligned(buf + 8 * 9);                                   \
    a02 ^= sph_dec64le_aligned(buf + 8 * 10);                                  \
    a12 ^= sph_dec64le_aligned(buf + 8 * 11);                                  \
    a22 ^= sph_dec64le_aligned(buf + 8 * 12);                                  \
    a32 ^= sph_dec64le_aligned(buf + 8 * 13);                                  \
    a42 ^= sph_dec64le_aligned(buf + 8 * 14);                                  \
    a03 ^= sph_dec64le_aligned(buf + 8 * 15);                                  \
    a13 ^= sph_dec64le_aligned(buf + 8 * 16);                                  \
    a23 ^= sph_dec64le_aligned(buf + 8 * 17);                                  \
  } while (0)

/*
 * Copy mode, 136-byte block: XOR 17 consecutive 64-bit words of `buf`
 * (decoded with sph_dec64le_aligned) into the first 17 lanes in storage
 * order, a00 through a13. Offsets are written as 8 * word-index.
 * `buf` and the lane variables come from the expansion site.
 */
#define INPUT_BUF136                                                           \
  do {                                                                         \
    a00 ^= sph_dec64le_aligned(buf + 8 * 0);                                   \
    a10 ^= sph_dec64le_aligned(buf + 8 * 1);                                   \
    a20 ^= sph_dec64le_aligned(buf + 8 * 2);                                   \
    a30 ^= sph_dec64le_aligned(buf + 8 * 3);                                   \
    a40 ^= sph_dec64le_aligned(buf + 8 * 4);                                   \
    a01 ^= sph_dec64le_aligned(buf + 8 * 5);                                   \
    a11 ^= sph_dec64le_aligned(buf + 8 * 6);                                   \
    a21 ^= sph_dec64le_aligned(buf + 8 * 7);                                   \
    a31 ^= sph_dec64le_aligned(buf + 8 * 8);                                   \
    a41 ^= sph_dec64le_aligned(buf + 8 * 9);                                   \
    a02 ^= sph_dec64le_aligned(buf + 8 * 10);                                  \
    a12 ^= sph_dec64le_aligned(buf + 8 * 11);                                  \
    a22 ^= sph_dec64le_aligned(buf + 8 * 12);                                  \
    a32 ^= sph_dec64le_aligned(buf + 8 * 13);                                  \
    a42 ^= sph_dec64le_aligned(buf + 8 * 14);                                  \
    a03 ^= sph_dec64le_aligned(buf + 8 * 15);                                  \
    a13 ^= sph_dec64le_aligned(buf + 8 * 16);                                  \
  } while (0)

/*
 * Copy mode, 104-byte block: XOR 13 consecutive 64-bit words of `buf`
 * (decoded with sph_dec64le_aligned) into the first 13 lanes in storage
 * order, a00 through a22. Offsets are written as 8 * word-index.
 * `buf` and the lane variables come from the expansion site.
 */
#define INPUT_BUF104                                                           \
  do {                                                                         \
    a00 ^= sph_dec64le_aligned(buf + 8 * 0);                                   \
    a10 ^= sph_dec64le_aligned(buf + 8 * 1);                                   \
    a20 ^= sph_dec64le_aligned(buf + 8 * 2);                                   \
    a30 ^= sph_dec64le_aligned(buf + 8 * 3);                                   \
    a40 ^= sph_dec64le_aligned(buf + 8 * 4);                                   \
    a01 ^= sph_dec64le_aligned(buf + 8 * 5);                                   \
    a11 ^= sph_dec64le_aligned(buf + 8 * 6);                                   \
    a21 ^= sph_dec64le_aligned(buf + 8 * 7);                                   \
    a31 ^= sph_dec64le_aligned(buf + 8 * 8);                                   \
    a41 ^= sph_dec64le_aligned(buf + 8 * 9);                                   \
    a02 ^= sph_dec64le_aligned(buf + 8 * 10);                                  \
    a12 ^= sph_dec64le_aligned(buf + 8 * 11);                                  \
    a22 ^= sph_dec64le_aligned(buf + 8 * 12);                                  \
  } while (0)

/*
 * Copy mode, 72-byte block: XOR 9 consecutive 64-bit words of `buf`
 * (decoded with sph_dec64le_aligned) into the first 9 lanes in storage
 * order, a00 through a31. Offsets are written as 8 * word-index.
 * `buf` and the lane variables come from the expansion site.
 */
#define INPUT_BUF72                                                            \
  do {                                                                         \
    a00 ^= sph_dec64le_aligned(buf + 8 * 0);                                   \
    a10 ^= sph_dec64le_aligned(buf + 8 * 1);                                   \
    a20 ^= sph_dec64le_aligned(buf + 8 * 2);                                   \
    a30 ^= sph_dec64le_aligned(buf + 8 * 3);                                   \
    a40 ^= sph_dec64le_aligned(buf + 8 * 4);                                   \
    a01 ^= sph_dec64le_aligned(buf + 8 * 5);                                   \
    a11 ^= sph_dec64le_aligned(buf + 8 * 6);                                   \
    a21 ^= sph_dec64le_aligned(buf + 8 * 7);                                   \
    a31 ^= sph_dec64le_aligned(buf + 8 * 8);                                   \
  } while (0)

/*
 * Copy mode, block size chosen by `lim` (in bytes): XOR 64-bit words of
 * `buf` (decoded with sph_dec64le_aligned) into the lanes in storage
 * order. lim == 72, 104 or 136 stops after 9, 13 or 17 words; any other
 * value absorbs all 18 words (144 bytes). The `break` statements leave
 * only the enclosing do/while(0). `lim` may be evaluated up to three
 * times.
 */
#define INPUT_BUF(lim)                                                         \
  do {                                                                         \
    a00 ^= sph_dec64le_aligned(buf + 8 * 0);                                   \
    a10 ^= sph_dec64le_aligned(buf + 8 * 1);                                   \
    a20 ^= sph_dec64le_aligned(buf + 8 * 2);                                   \
    a30 ^= sph_dec64le_aligned(buf + 8 * 3);                                   \
    a40 ^= sph_dec64le_aligned(buf + 8 * 4);                                   \
    a01 ^= sph_dec64le_aligned(buf + 8 * 5);                                   \
    a11 ^= sph_dec64le_aligned(buf + 8 * 6);                                   \
    a21 ^= sph_dec64le_aligned(buf + 8 * 7);                                   \
    a31 ^= sph_dec64le_aligned(buf + 8 * 8);                                   \
    if ((lim) == 72) {                                                         \
      break;                                                                   \
    }                                                                          \
    a41 ^= sph_dec64le_aligned(buf + 8 * 9);                                   \
    a02 ^= sph_dec64le_aligned(buf + 8 * 10);                                  \
    a12 ^= sph_dec64le_aligned(buf + 8 * 11);                                  \
    a22 ^= sph_dec64le_aligned(buf + 8 * 12);                                  \
    if ((lim) == 104) {                                                        \
      break;                                                                   \
    }                                                                          \
    a32 ^= sph_dec64le_aligned(buf + 8 * 13);                                  \
    a42 ^= sph_dec64le_aligned(buf + 8 * 14);                                  \
    a03 ^= sph_dec64le_aligned(buf + 8 * 15);                                  \
    a13 ^= sph_dec64le_aligned(buf + 8 * 16);                                  \
    if ((lim) == 136) {                                                        \
      break;                                                                   \
    }                                                                          \
    a23 ^= sph_dec64le_aligned(buf + 8 * 17);                                  \
  } while (0)

#endif

/*
 * Elementary operations on 64-bit lanes for the 64-bit code path.
 *
 *   DECL64(x)        declare a 64-bit lane variable named x
 *   MOV64(d, s)      d = s
 *   XOR64(d, a, b)   d = a ^ b
 *   AND64(d, a, b)   d = a & b
 *   OR64(d, a, b)    d = a | b
 *   NOT64(d, s)      d = ~s, passed through SPH_T64
 *   ROL64(d, v, n)   d = SPH_ROTL64(v, n)
 *   XOR64_IOTA       alias of XOR64
 *
 * `d` must be a modifiable lvalue. Every operand is parenthesized inside
 * the expansion so that an argument containing a lower-precedence
 * operator (for instance `x | y` passed to AND64) is combined as a unit
 * rather than being re-associated with the operator in the macro body.
 * Arguments handed on to SPH_ROTL64 are already delimited by the call's
 * commas and are passed through unchanged.
 */
#define DECL64(x) sph_u64 x
#define MOV64(d, s) ((d) = (s))
#define XOR64(d, a, b) ((d) = (a) ^ (b))
#define AND64(d, a, b) ((d) = (a) & (b))
#define OR64(d, a, b) ((d) = (a) | (b))
#define NOT64(d, s) ((d) = SPH_T64(~(s)))
#define ROL64(d, v, n) ((d) = SPH_ROTL64(v, n))
#define XOR64_IOTA XOR64

#else

/*
 * Constant table for the 32-bit code path: 24 entries, each stored as a
 * {high, low} pair of 32-bit words.
 *
 * Without SPH_KECCAK_INTERLEAVE the pairs are the upper and lower 32-bit
 * halves of the 64-bit constants used by the 64-bit code path.
 * With SPH_KECCAK_INTERLEAVE a different encoding of the table is
 * selected.
 * NOTE(review): presumably the same constants in bit-interleaved form
 * (as produced by INTERLEAVE) -- not re-derived here, confirm against
 * the reference implementation before editing any entry.
 */
static const struct {
  sph_u32 high, low;
} RC[] = {
#if SPH_KECCAK_INTERLEAVE
    {SPH_C32(0x00000000), SPH_C32(0x00000001)},
    {SPH_C32(0x00000089), SPH_C32(0x00000000)},
    {SPH_C32(0x8000008B), SPH_C32(0x00000000)},
    {SPH_C32(0x80008080), SPH_C32(0x00000000)},
    {SPH_C32(0x0000008B), SPH_C32(0x00000001)},
    {SPH_C32(0x00008000), SPH_C32(0x00000001)},
    {SPH_C32(0x80008088), SPH_C32(0x00000001)},
    {SPH_C32(0x80000082), SPH_C32(0x00000001)},
    {SPH_C32(0x0000000B), SPH_C32(0x00000000)},
    {SPH_C32(0x0000000A), SPH_C32(0x00000000)},
    {SPH_C32(0x00008082), SPH_C32(0x00000001)},
    {SPH_C32(0x00008003), SPH_C32(0x00000000)},
    {SPH_C32(0x0000808B), SPH_C32(0x00000001)},
    {SPH_C32(0x8000000B), SPH_C32(0x00000001)},
    {SPH_C32(0x8000008A), SPH_C32(0x00000001)},
    {SPH_C32(0x80000081), SPH_C32(0x00000001)},
    {SPH_C32(0x80000081), SPH_C32(0x00000000)},
    {SPH_C32(0x80000008), SPH_C32(0x00000000)},
    {SPH_C32(0x00000083), SPH_C32(0x00000000)},
    {SPH_C32(0x80008003), SPH_C32(0x00000000)},
    {SPH_C32(0x80008088), SPH_C32(0x00000001)},
    {SPH_C32(0x80000088), SPH_C32(0x00000000)},
    {SPH_C32(0x00008000), SPH_C32(0x00000001)},
    {SPH_C32(0x80008082), SPH_C32(0x00000000)}
#else
    {SPH_C32(0x00000000), SPH_C32(0x00000001)},
    {SPH_C32(0x00000000), SPH_C32(0x00008082)},
    {SPH_C32(0x80000000), SPH_C32(0x0000808A)},
    {SPH_C32(0x80000000), SPH_C32(0x80008000)},
    {SPH_C32(0x00000000), SPH_C32(0x0000808B)},
    {SPH_C32(0x00000000), SPH_C32(0x80000001)},
    {SPH_C32(0x80000000), SPH_C32(0x80008081)},
    {SPH_C32(0x80000000), SPH_C32(0x00008009)},
    {SPH_C32(0x00000000), SPH_C32(0x0000008A)},
    {SPH_C32(0x00000000), SPH_C32(0x00000088)},
    {SPH_C32(0x00000000), SPH_C32(0x80008009)},
    {SPH_C32(0x00000000), SPH_C32(0x8000000A)},
    {SPH_C32(0x00000000), SPH_C32(0x8000808B)},
    {SPH_C32(0x80000000), SPH_C32(0x0000008B)},
    {SPH_C32(0x80000000), SPH_C32(0x00008089)},
    {SPH_C32(0x80000000), SPH_C32(0x00008003)},
    {SPH_C32(0x80000000), SPH_C32(0x00008002)},
    {SPH_C32(0x80000000), SPH_C32(0x00000080)},
    {SPH_C32(0x00000000), SPH_C32(0x0000800A)},
    {SPH_C32(0x80000000), SPH_C32(0x8000000A)},
    {SPH_C32(0x80000000), SPH_C32(0x80008081)},
    {SPH_C32(0x80000000), SPH_C32(0x00008080)},
    {SPH_C32(0x00000000), SPH_C32(0x80000001)},
    {SPH_C32(0x80000000), SPH_C32(0x80008008)}
#endif
};

#if SPH_KECCAK_INTERLEAVE

/*
 * Convert the two 32-bit halves (xl, xh) of a 64-bit word to the
 * interleaved representation, in place.
 *
 * Each half is first permuted on its own by four masked swap steps:
 * adjacent bits (mask 0x22222222), 2-bit groups (0x0C0C0C0C), 4-bit
 * groups (0x00F000F0) and bytes (0x0000FF00). The last step exchanges
 * the upper 16 bits of the low half with the lower 16 bits of the high
 * half.
 * NOTE(review): the net effect is presumably "even-indexed bits gathered
 * in xl, odd-indexed bits in xh" -- confirm against the reference
 * description of bit interleaving.
 *
 * xl and xh must be modifiable lvalues; each is expanded twice (one read,
 * one write). The block-local names l, h and t must not appear in the
 * arguments.
 */
#define INTERLEAVE(xl, xh)                                                     \
  do {                                                                         \
    sph_u32 l, h, t;                                                           \
    l = (xl);                                                                  \
    h = (xh);                                                                  \
    t = (l ^ (l >> 1)) & SPH_C32(0x22222222);                                  \
    l ^= t ^ (t << 1);                                                         \
    t = (h ^ (h >> 1)) & SPH_C32(0x22222222);                                  \
    h ^= t ^ (t << 1);                                                         \
    t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C);                                  \
    l ^= t ^ (t << 2);                                                         \
    t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C);                                  \
    h ^= t ^ (t << 2);                                                         \
    t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0);                                  \
    l ^= t ^ (t << 4);                                                         \
    t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0);                                  \
    h ^= t ^ (t << 4);                                                         \
    t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00);                                  \
    l ^= t ^ (t << 8);                                                         \
    t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00);                                  \
    h ^= t ^ (t << 8);                                                         \
    t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000);                          \
    l ^= t;                                                                    \
    h ^= t >> 16;                                                              \
    (xl) = l;                                                                  \
    (xh) = h;                                                                  \
  } while (0)

/*
 * Convert the pair (xl, xh) back from the interleaved representation,
 * in place. The steps are the masked swaps of INTERLEAVE applied in the
 * opposite order: first the 16-bit exchange between the two halves, then
 * the byte, 4-bit, 2-bit and single-bit swaps within each half. Each
 * step is a swap and therefore its own inverse, so running them in
 * reverse order undoes INTERLEAVE.
 *
 * xl and xh must be modifiable lvalues; each is expanded twice (one read,
 * one write). The block-local names l, h and t must not appear in the
 * arguments.
 */
#define UNINTERLEAVE(xl, xh)                                                   \
  do {                                                                         \
    sph_u32 l, h, t;                                                           \
    l = (xl);                                                                  \
    h = (xh);                                                                  \
    t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000);                          \
    l ^= t;                                                                    \
    h ^= t >> 16;                                                              \
    t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00);                                  \
    l ^= t ^ (t << 8);                                                         \
    t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00);                                  \
    h ^= t ^ (t << 8);                                                         \
    t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0);                                  \
    l ^= t ^ (t << 4);                                                         \
    t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0);                                  \
    h ^= t ^ (t << 4);                                                         \
    t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C);                                  \
    l ^= t ^ (t << 2);                                                         \
    t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C);                                  \
    h ^= t ^ (t << 2);                                                         \
    t = (l ^ (l >> 1)) & SPH_C32(0x22222222);                                  \
    l ^= t ^ (t << 1);                                                         \
    t = (h ^ (h >> 1)) & SPH_C32(0x22222222);                                  \
    h ^= t ^ (t << 1);                                                         \
    (xl) = l;                                                                  \
    (xh) = h;                                                                  \
  } while (0)

#else

/*
 * Interleaving disabled: both conversions expand to nothing, so the
 * halves are used exactly as decoded and the arguments are not evaluated.
 */
#define INTERLEAVE(l, h)
#define UNINTERLEAVE(l, h)

#endif

#if SPH_KECCAK_NOCOPY

/*
 * No-copy mode, 32-bit code path: each lane half is an alias for one
 * 32-bit word of the context state, so any code using these names works
 * on kc->u.narrow in place. Lane aXY occupies two consecutive words:
 * aXYl at index 2 * (5 * Y + X) and aXYh right after it. A pointer named
 * `kc` must be in scope wherever these names are expanded.
 */
#define a00l (kc->u.narrow[2 * (5 * 0 + 0) + 0])
#define a00h (kc->u.narrow[2 * (5 * 0 + 0) + 1])
#define a10l (kc->u.narrow[2 * (5 * 0 + 1) + 0])
#define a10h (kc->u.narrow[2 * (5 * 0 + 1) + 1])
#define a20l (kc->u.narrow[2 * (5 * 0 + 2) + 0])
#define a20h (kc->u.narrow[2 * (5 * 0 + 2) + 1])
#define a30l (kc->u.narrow[2 * (5 * 0 + 3) + 0])
#define a30h (kc->u.narrow[2 * (5 * 0 + 3) + 1])
#define a40l (kc->u.narrow[2 * (5 * 0 + 4) + 0])
#define a40h (kc->u.narrow[2 * (5 * 0 + 4) + 1])
#define a01l (kc->u.narrow[2 * (5 * 1 + 0) + 0])
#define a01h (kc->u.narrow[2 * (5 * 1 + 0) + 1])
#define a11l (kc->u.narrow[2 * (5 * 1 + 1) + 0])
#define a11h (kc->u.narrow[2 * (5 * 1 + 1) + 1])
#define a21l (kc->u.narrow[2 * (5 * 1 + 2) + 0])
#define a21h (kc->u.narrow[2 * (5 * 1 + 2) + 1])
#define a31l (kc->u.narrow[2 * (5 * 1 + 3) + 0])
#define a31h (kc->u.narrow[2 * (5 * 1 + 3) + 1])
#define a41l (kc->u.narrow[2 * (5 * 1 + 4) + 0])
#define a41h (kc->u.narrow[2 * (5 * 1 + 4) + 1])
#define a02l (kc->u.narrow[2 * (5 * 2 + 0) + 0])
#define a02h (kc->u.narrow[2 * (5 * 2 + 0) + 1])
#define a12l (kc->u.narrow[2 * (5 * 2 + 1) + 0])
#define a12h (kc->u.narrow[2 * (5 * 2 + 1) + 1])
#define a22l (kc->u.narrow[2 * (5 * 2 + 2) + 0])
#define a22h (kc->u.narrow[2 * (5 * 2 + 2) + 1])
#define a32l (kc->u.narrow[2 * (5 * 2 + 3) + 0])
#define a32h (kc->u.narrow[2 * (5 * 2 + 3) + 1])
#define a42l (kc->u.narrow[2 * (5 * 2 + 4) + 0])
#define a42h (kc->u.narrow[2 * (5 * 2 + 4) + 1])
#define a03l (kc->u.narrow[2 * (5 * 3 + 0) + 0])
#define a03h (kc->u.narrow[2 * (5 * 3 + 0) + 1])
#define a13l (kc->u.narrow[2 * (5 * 3 + 1) + 0])
#define a13h (kc->u.narrow[2 * (5 * 3 + 1) + 1])
#define a23l (kc->u.narrow[2 * (5 * 3 + 2) + 0])
#define a23h (kc->u.narrow[2 * (5 * 3 + 2) + 1])
#define a33l (kc->u.narrow[2 * (5 * 3 + 3) + 0])
#define a33h (kc->u.narrow[2 * (5 * 3 + 3) + 1])
#define a43l (kc->u.narrow[2 * (5 * 3 + 4) + 0])
#define a43h (kc->u.narrow[2 * (5 * 3 + 4) + 1])
#define a04l (kc->u.narrow[2 * (5 * 4 + 0) + 0])
#define a04h (kc->u.narrow[2 * (5 * 4 + 0) + 1])
#define a14l (kc->u.narrow[2 * (5 * 4 + 1) + 0])
#define a14h (kc->u.narrow[2 * (5 * 4 + 1) + 1])
#define a24l (kc->u.narrow[2 * (5 * 4 + 2) + 0])
#define a24h (kc->u.narrow[2 * (5 * 4 + 2) + 1])
#define a34l (kc->u.narrow[2 * (5 * 4 + 3) + 0])
#define a34h (kc->u.narrow[2 * (5 * 4 + 3) + 1])
#define a44l (kc->u.narrow[2 * (5 * 4 + 4) + 0])
#define a44h (kc->u.narrow[2 * (5 * 4 + 4) + 1])

/* Nothing to declare, load or store: the state is accessed in place. */
#define DECL_STATE
#define READ_STATE(state)
#define WRITE_STATE(state)

/*
 * No-copy mode, 32-bit code path: XOR the first `size` bytes of `buf`
 * into the state, 8 bytes at a time. Each 8-byte group is decoded as two
 * 32-bit words with sph_dec32le_aligned (bytes 0..3, then bytes 4..7),
 * passed through INTERLEAVE, and XORed into the two consecutive
 * kc->u.narrow words starting at index (byte offset) / 4. `kc` and `buf`
 * are taken from the expansion site; `size` is re-evaluated on every
 * iteration.
 */
#define INPUT_BUF(size)                                                        \
  do {                                                                         \
    size_t kec_off_;                                                           \
    for (kec_off_ = 0; kec_off_ < (size); kec_off_ += 8) {                     \
      sph_u32 kec_lo_ = sph_dec32le_aligned(buf + kec_off_);                   \
      sph_u32 kec_hi_ = sph_dec32le_aligned(buf + kec_off_ + 4);               \
      INTERLEAVE(kec_lo_, kec_hi_);                                            \
      kc->u.narrow[kec_off_ / 4] ^= kec_lo_;                                   \
      kc->u.narrow[kec_off_ / 4 + 1] ^= kec_hi_;                               \
    }                                                                          \
  } while (0)

/* Fixed-size variants, one per supported block length in bytes. */
#define INPUT_BUF72 INPUT_BUF(72)
#define INPUT_BUF104 INPUT_BUF(104)
#define INPUT_BUF136 INPUT_BUF(136)
#define INPUT_BUF144 INPUT_BUF(144)

#else

/*
 * Copy mode, 32-bit code path: declare the low ("l") and high ("h")
 * halves of the 25 lanes as local sph_u32 variables, 50 in total. The
 * lanes are listed in the order a00, a10, ..., a40, a01, ..., a44. The
 * expansion already ends with ';', so the macro stands alone as a
 * declaration.
 */
#define DECL_STATE                                                             \
  sph_u32 a00l, a00h, a10l, a10h, a20l, a20h, a30l, a30h, a40l, a40h;          \
  sph_u32 a01l, a01h, a11l, a11h, a21l, a21h, a31l, a31h, a41l, a41h;          \
  sph_u32 a02l, a02h, a12l, a12h, a22l, a22h, a32l, a32h, a42l, a42h;          \
  sph_u32 a03l, a03h, a13l, a13h, a23l, a23h, a33l, a33h, a43l, a43h;          \
  sph_u32 a04l, a04h, a14l, a14h, a24l, a24h, a34l, a34h, a44l, a44h;

/*
 * Load all 50 words of (state)->u.narrow[] into the local lane variables.
 * Lane aXY is lane number X + 5*Y; its l word sits at index 2*lane and its
 * h word at 2*lane + 1. One lane per line, in ascending index order.
 */
#define READ_STATE(state)                                                      \
  do {                                                                         \
    a00l = (state)->u.narrow[0];  a00h = (state)->u.narrow[1];                 \
    a10l = (state)->u.narrow[2];  a10h = (state)->u.narrow[3];                 \
    a20l = (state)->u.narrow[4];  a20h = (state)->u.narrow[5];                 \
    a30l = (state)->u.narrow[6];  a30h = (state)->u.narrow[7];                 \
    a40l = (state)->u.narrow[8];  a40h = (state)->u.narrow[9];                 \
    a01l = (state)->u.narrow[10];  a01h = (state)->u.narrow[11];               \
    a11l = (state)->u.narrow[12];  a11h = (state)->u.narrow[13];               \
    a21l = (state)->u.narrow[14];  a21h = (state)->u.narrow[15];               \
    a31l = (state)->u.narrow[16];  a31h = (state)->u.narrow[17];               \
    a41l = (state)->u.narrow[18];  a41h = (state)->u.narrow[19];               \
    a02l = (state)->u.narrow[20];  a02h = (state)->u.narrow[21];               \
    a12l = (state)->u.narrow[22];  a12h = (state)->u.narrow[23];               \
    a22l = (state)->u.narrow[24];  a22h = (state)->u.narrow[25];               \
    a32l = (state)->u.narrow[26];  a32h = (state)->u.narrow[27];               \
    a42l = (state)->u.narrow[28];  a42h = (state)->u.narrow[29];               \
    a03l = (state)->u.narrow[30];  a03h = (state)->u.narrow[31];               \
    a13l = (state)->u.narrow[32];  a13h = (state)->u.narrow[33];               \
    a23l = (state)->u.narrow[34];  a23h = (state)->u.narrow[35];               \
    a33l = (state)->u.narrow[36];  a33h = (state)->u.narrow[37];               \
    a43l = (state)->u.narrow[38];  a43h = (state)->u.narrow[39];               \
    a04l = (state)->u.narrow[40];  a04h = (state)->u.narrow[41];               \
    a14l = (state)->u.narrow[42];  a14h = (state)->u.narrow[43];               \
    a24l = (state)->u.narrow[44];  a24h = (state)->u.narrow[45];               \
    a34l = (state)->u.narrow[46];  a34h = (state)->u.narrow[47];               \
    a44l = (state)->u.narrow[48];  a44h = (state)->u.narrow[49];               \
  } while (0)

/*
 * Store the local lane variables back into (state)->u.narrow[]. This is the
 * exact inverse of READ_STATE: lane aXY (lane number X + 5*Y) goes to index
 * 2*lane (l word) and 2*lane + 1 (h word). One lane per line.
 */
#define WRITE_STATE(state)                                                     \
  do {                                                                         \
    (state)->u.narrow[0] = a00l;  (state)->u.narrow[1] = a00h;                 \
    (state)->u.narrow[2] = a10l;  (state)->u.narrow[3] = a10h;                 \
    (state)->u.narrow[4] = a20l;  (state)->u.narrow[5] = a20h;                 \
    (state)->u.narrow[6] = a30l;  (state)->u.narrow[7] = a30h;                 \
    (state)->u.narrow[8] = a40l;  (state)->u.narrow[9] = a40h;                 \
    (state)->u.narrow[10] = a01l;  (state)->u.narrow[11] = a01h;               \
    (state)->u.narrow[12] = a11l;  (state)->u.narrow[13] = a11h;               \
    (state)->u.narrow[14] = a21l;  (state)->u.narrow[15] = a21h;               \
    (state)->u.narrow[16] = a31l;  (state)->u.narrow[17] = a31h;               \
    (state)->u.narrow[18] = a41l;  (state)->u.narrow[19] = a41h;               \
    (state)->u.narrow[20] = a02l;  (state)->u.narrow[21] = a02h;               \
    (state)->u.narrow[22] = a12l;  (state)->u.narrow[23] = a12h;               \
    (state)->u.narrow[24] = a22l;  (state)->u.narrow[25] = a22h;               \
    (state)->u.narrow[26] = a32l;  (state)->u.narrow[27] = a32h;               \
    (state)->u.narrow[28] = a42l;  (state)->u.narrow[29] = a42h;               \
    (state)->u.narrow[30] = a03l;  (state)->u.narrow[31] = a03h;               \
    (state)->u.narrow[32] = a13l;  (state)->u.narrow[33] = a13h;               \
    (state)->u.narrow[34] = a23l;  (state)->u.narrow[35] = a23h;               \
    (state)->u.narrow[36] = a33l;  (state)->u.narrow[37] = a33h;               \
    (state)->u.narrow[38] = a43l;  (state)->u.narrow[39] = a43h;               \
    (state)->u.narrow[40] = a04l;  (state)->u.narrow[41] = a04h;               \
    (state)->u.narrow[42] = a14l;  (state)->u.narrow[43] = a14h;               \
    (state)->u.narrow[44] = a24l;  (state)->u.narrow[45] = a24h;               \
    (state)->u.narrow[46] = a34l;  (state)->u.narrow[47] = a34h;               \
    (state)->u.narrow[48] = a44l;  (state)->u.narrow[49] = a44h;               \
  } while (0)

/*
 * XOR the 8-byte input group found `off` bytes into `buf` into the local
 * lane pair d##l / d##h. The group is decoded as two words with
 * sph_dec32le_aligned() and passed through INTERLEAVE() before the XOR.
 * `d` is a lane base name (e.g. a00); `buf` comes from the expansion site.
 */
#define READ64(d, off)                                                         \
  do {                                                                         \
    sph_u32 tl = sph_dec32le_aligned(buf + (off));                             \
    sph_u32 th = sph_dec32le_aligned(buf + (off) + 4);                         \
    INTERLEAVE(tl, th);                                                        \
    d##l ^= tl;                                                                \
    d##h ^= th;                                                                \
  } while (0)

/*
 * Fixed-size absorb macros: XOR the first 72 / 104 / 136 / 144 bytes of
 * `buf` into the local lanes, 8 bytes per lane, lanes taken in the order
 * a00, a10, a20, a30, a40, a01, ... Each size is written as the next
 * smaller one followed by the additional lanes it covers.
 */

/* 72 bytes: lanes a00 .. a31 (9 lanes). */
#define INPUT_BUF72                                                            \
  do {                                                                         \
    READ64(a00, 0);                                                            \
    READ64(a10, 8);                                                            \
    READ64(a20, 16);                                                           \
    READ64(a30, 24);                                                           \
    READ64(a40, 32);                                                           \
    READ64(a01, 40);                                                           \
    READ64(a11, 48);                                                           \
    READ64(a21, 56);                                                           \
    READ64(a31, 64);                                                           \
  } while (0)

/* 104 bytes: the 72-byte prefix plus lanes a41 .. a22. */
#define INPUT_BUF104                                                           \
  do {                                                                         \
    INPUT_BUF72;                                                               \
    READ64(a41, 72);                                                           \
    READ64(a02, 80);                                                           \
    READ64(a12, 88);                                                           \
    READ64(a22, 96);                                                           \
  } while (0)

/* 136 bytes: the 104-byte prefix plus lanes a32 .. a13. */
#define INPUT_BUF136                                                           \
  do {                                                                         \
    INPUT_BUF104;                                                              \
    READ64(a32, 104);                                                          \
    READ64(a42, 112);                                                          \
    READ64(a03, 120);                                                          \
    READ64(a13, 128);                                                          \
  } while (0)

/* 144 bytes: the 136-byte prefix plus lane a23. */
#define INPUT_BUF144                                                           \
  do {                                                                         \
    INPUT_BUF136;                                                              \
    READ64(a23, 136);                                                          \
  } while (0)

/*
 * Absorb `lim` bytes of `buf` when the size is only known at run time.
 * The first 9 lanes (72 bytes) are always read. Reading then continues up
 * to 104, 136 and finally 144 bytes, stopping as soon as `lim` equals the
 * size reached so far. `lim` is evaluated up to three times.
 */
#define INPUT_BUF(lim)                                                         \
  do {                                                                         \
    INPUT_BUF72;                                                               \
    if ((lim) != 72) {                                                         \
      READ64(a41, 72);                                                         \
      READ64(a02, 80);                                                         \
      READ64(a12, 88);                                                         \
      READ64(a22, 96);                                                         \
      if ((lim) != 104) {                                                      \
        READ64(a32, 104);                                                      \
        READ64(a42, 112);                                                      \
        READ64(a03, 120);                                                      \
        READ64(a13, 128);                                                      \
        if ((lim) != 136) {                                                    \
          READ64(a23, 136);                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
  } while (0)

#endif

/*
 * Operations on a 64-bit lane held as two 32-bit words. A lane is named by
 * its base identifier `x`; the macros paste the `l` / `h` suffix themselves,
 * so every lane operand must be a plain identifier prefix, not an
 * expression.
 */

/*
 * Declare the two words of lane `x`. They are sph_u32, the same type as the
 * state words and the rotation temporaries of this code path, so the
 * two-word implementation does not depend on a 64-bit integer type.
 */
#define DECL64(x) sph_u32 x##l, x##h
/* d = s */
#define MOV64(d, s) (d##l = s##l, d##h = s##h)
/* d = a ^ b */
#define XOR64(d, a, b) (d##l = a##l ^ b##l, d##h = a##h ^ b##h)
/* d = a & b */
#define AND64(d, a, b) (d##l = a##l & b##l, d##h = a##h & b##h)
/* d = a | b */
#define OR64(d, a, b) (d##l = a##l | b##l, d##h = a##h | b##h)
/* d = ~s, each word reduced with SPH_T32 after the complement. */
#define NOT64(d, s) (d##l = SPH_T32(~s##l), d##h = SPH_T32(~s##h))
/*
 * Rotate lane v left by n into d. The count is pasted onto ROL64_ to select
 * one of the ROL64_0 .. ROL64_63 macros, so n must be a literal integer
 * token in that range.
 */
#define ROL64(d, v, n) ROL64_##n(d, v)

#if SPH_KECCAK_INTERLEAVE

/*
 * Rotation helpers for the SPH_KECCAK_INTERLEAVE representation. In every
 * helper the old v##l is either saved in `tmp` or consumed before d##l is
 * written, so the macros remain correct when d and v name the same lane
 * (RHO rotates lanes in place).
 */

/*
 * Rotate by 1: d##l receives v##h rotated left by one bit, d##h receives
 * v##l unchanged.
 */
#define ROL64_odd1(d, v)                                                       \
  do {                                                                         \
    sph_u32 tmp;                                                               \
    tmp = v##l;                                                                \
    d##l = SPH_T32(v##h << 1) | (v##h >> 31);                                  \
    d##h = tmp;                                                                \
  } while (0)

/*
 * Rotate by 63: d##l receives v##h unchanged, d##h receives v##l rotated
 * left by 31 bits.
 */
#define ROL64_odd63(d, v)                                                      \
  do {                                                                         \
    sph_u32 tmp;                                                               \
    tmp = SPH_T32(v##l << 31) | (v##l >> 1);                                   \
    d##l = v##h;                                                               \
    d##h = tmp;                                                                \
  } while (0)

/*
 * Rotate by the odd count 2n - 1: d##l receives v##h rotated left by n,
 * d##h receives v##l rotated left by n - 1. The dispatch table passes n in
 * 2..31; n = 1 would need a right shift by 32 (33 - n), which is why
 * rotation by 1 has its own macro. n is not parenthesised and is expected
 * to be a literal.
 */
#define ROL64_odd(d, v, n)                                                     \
  do {                                                                         \
    sph_u32 tmp;                                                               \
    tmp = SPH_T32(v##l << (n - 1)) | (v##l >> (33 - n));                       \
    d##l = SPH_T32(v##h << n) | (v##h >> (32 - n));                            \
    d##h = tmp;                                                                \
  } while (0)

/*
 * Rotate by the even count 2n: both words are rotated left by n
 * independently. The dispatch table passes n in 1..31.
 */
#define ROL64_even(d, v, n)                                                    \
  do {                                                                         \
    d##l = SPH_T32(v##l << n) | (v##l >> (32 - n));                            \
    d##h = SPH_T32(v##h << n) | (v##h >> (32 - n));                            \
  } while (0)

/*
 * Dispatch table for ROL64(d, v, k) in the interleaved representation.
 * ROL64_0 expands to nothing. k = 1 and k = 63 use dedicated macros; every
 * other odd k maps to ROL64_odd with n = (k + 1) / 2 and every even k maps
 * to ROL64_even with n = k / 2.
 */
#define ROL64_0(d, v)
#define ROL64_1(d, v) ROL64_odd1(d, v)
#define ROL64_2(d, v) ROL64_even(d, v, 1)
#define ROL64_3(d, v) ROL64_odd(d, v, 2)
#define ROL64_4(d, v) ROL64_even(d, v, 2)
#define ROL64_5(d, v) ROL64_odd(d, v, 3)
#define ROL64_6(d, v) ROL64_even(d, v, 3)
#define ROL64_7(d, v) ROL64_odd(d, v, 4)
#define ROL64_8(d, v) ROL64_even(d, v, 4)
#define ROL64_9(d, v) ROL64_odd(d, v, 5)
#define ROL64_10(d, v) ROL64_even(d, v, 5)
#define ROL64_11(d, v) ROL64_odd(d, v, 6)
#define ROL64_12(d, v) ROL64_even(d, v, 6)
#define ROL64_13(d, v) ROL64_odd(d, v, 7)
#define ROL64_14(d, v) ROL64_even(d, v, 7)
#define ROL64_15(d, v) ROL64_odd(d, v, 8)
#define ROL64_16(d, v) ROL64_even(d, v, 8)
#define ROL64_17(d, v) ROL64_odd(d, v, 9)
#define ROL64_18(d, v) ROL64_even(d, v, 9)
#define ROL64_19(d, v) ROL64_odd(d, v, 10)
#define ROL64_20(d, v) ROL64_even(d, v, 10)
#define ROL64_21(d, v) ROL64_odd(d, v, 11)
#define ROL64_22(d, v) ROL64_even(d, v, 11)
#define ROL64_23(d, v) ROL64_odd(d, v, 12)
#define ROL64_24(d, v) ROL64_even(d, v, 12)
#define ROL64_25(d, v) ROL64_odd(d, v, 13)
#define ROL64_26(d, v) ROL64_even(d, v, 13)
#define ROL64_27(d, v) ROL64_odd(d, v, 14)
#define ROL64_28(d, v) ROL64_even(d, v, 14)
#define ROL64_29(d, v) ROL64_odd(d, v, 15)
#define ROL64_30(d, v) ROL64_even(d, v, 15)
#define ROL64_31(d, v) ROL64_odd(d, v, 16)
#define ROL64_32(d, v) ROL64_even(d, v, 16)
#define ROL64_33(d, v) ROL64_odd(d, v, 17)
#define ROL64_34(d, v) ROL64_even(d, v, 17)
#define ROL64_35(d, v) ROL64_odd(d, v, 18)
#define ROL64_36(d, v) ROL64_even(d, v, 18)
#define ROL64_37(d, v) ROL64_odd(d, v, 19)
#define ROL64_38(d, v) ROL64_even(d, v, 19)
#define ROL64_39(d, v) ROL64_odd(d, v, 20)
#define ROL64_40(d, v) ROL64_even(d, v, 20)
#define ROL64_41(d, v) ROL64_odd(d, v, 21)
#define ROL64_42(d, v) ROL64_even(d, v, 21)
#define ROL64_43(d, v) ROL64_odd(d, v, 22)
#define ROL64_44(d, v) ROL64_even(d, v, 22)
#define ROL64_45(d, v) ROL64_odd(d, v, 23)
#define ROL64_46(d, v) ROL64_even(d, v, 23)
#define ROL64_47(d, v) ROL64_odd(d, v, 24)
#define ROL64_48(d, v) ROL64_even(d, v, 24)
#define ROL64_49(d, v) ROL64_odd(d, v, 25)
#define ROL64_50(d, v) ROL64_even(d, v, 25)
#define ROL64_51(d, v) ROL64_odd(d, v, 26)
#define ROL64_52(d, v) ROL64_even(d, v, 26)
#define ROL64_53(d, v) ROL64_odd(d, v, 27)
#define ROL64_54(d, v) ROL64_even(d, v, 27)
#define ROL64_55(d, v) ROL64_odd(d, v, 28)
#define ROL64_56(d, v) ROL64_even(d, v, 28)
#define ROL64_57(d, v) ROL64_odd(d, v, 29)
#define ROL64_58(d, v) ROL64_even(d, v, 29)
#define ROL64_59(d, v) ROL64_odd(d, v, 30)
#define ROL64_60(d, v) ROL64_even(d, v, 30)
#define ROL64_61(d, v) ROL64_odd(d, v, 31)
#define ROL64_62(d, v) ROL64_even(d, v, 31)
#define ROL64_63(d, v) ROL64_odd63(d, v)

#else

/*
 * Rotate lane v left by n bits into d, treating v##h as the upper word and
 * v##l as the lower word of the lane: each result word is its own word
 * shifted left by n, combined with the top n bits of the other word.
 * Used for n in 1..31 (the right shift is by 32 - n). The new low word is
 * held in `tmp` until d##h has been computed, so d and v may name the same
 * lane.
 */
#define ROL64_small(d, v, n)                                                   \
  do {                                                                         \
    sph_u32 tmp;                                                               \
    tmp = SPH_T32(v##l << n) | (v##h >> (32 - n));                             \
    d##h = SPH_T32(v##h << n) | (v##l >> (32 - n));                            \
    d##l = tmp;                                                                \
  } while (0)

/*
 * Dispatch table for ROL64(d, v, k), k = 0..31, in the non-interleaved
 * representation. ROL64_0 expands to the constant expression 0, which does
 * nothing when used as a statement; counts 1..31 go straight to
 * ROL64_small.
 */
#define ROL64_0(d, v) 0
#define ROL64_1(d, v) ROL64_small(d, v, 1)
#define ROL64_2(d, v) ROL64_small(d, v, 2)
#define ROL64_3(d, v) ROL64_small(d, v, 3)
#define ROL64_4(d, v) ROL64_small(d, v, 4)
#define ROL64_5(d, v) ROL64_small(d, v, 5)
#define ROL64_6(d, v) ROL64_small(d, v, 6)
#define ROL64_7(d, v) ROL64_small(d, v, 7)
#define ROL64_8(d, v) ROL64_small(d, v, 8)
#define ROL64_9(d, v) ROL64_small(d, v, 9)
#define ROL64_10(d, v) ROL64_small(d, v, 10)
#define ROL64_11(d, v) ROL64_small(d, v, 11)
#define ROL64_12(d, v) ROL64_small(d, v, 12)
#define ROL64_13(d, v) ROL64_small(d, v, 13)
#define ROL64_14(d, v) ROL64_small(d, v, 14)
#define ROL64_15(d, v) ROL64_small(d, v, 15)
#define ROL64_16(d, v) ROL64_small(d, v, 16)
#define ROL64_17(d, v) ROL64_small(d, v, 17)
#define ROL64_18(d, v) ROL64_small(d, v, 18)
#define ROL64_19(d, v) ROL64_small(d, v, 19)
#define ROL64_20(d, v) ROL64_small(d, v, 20)
#define ROL64_21(d, v) ROL64_small(d, v, 21)
#define ROL64_22(d, v) ROL64_small(d, v, 22)
#define ROL64_23(d, v) ROL64_small(d, v, 23)
#define ROL64_24(d, v) ROL64_small(d, v, 24)
#define ROL64_25(d, v) ROL64_small(d, v, 25)
#define ROL64_26(d, v) ROL64_small(d, v, 26)
#define ROL64_27(d, v) ROL64_small(d, v, 27)
#define ROL64_28(d, v) ROL64_small(d, v, 28)
#define ROL64_29(d, v) ROL64_small(d, v, 29)
#define ROL64_30(d, v) ROL64_small(d, v, 30)
#define ROL64_31(d, v) ROL64_small(d, v, 31)

/*
 * Rotate by exactly 32 bits: the two words trade places. v##l is saved in
 * `tmp` first so that d and v may name the same lane.
 */
#define ROL64_32(d, v)                                                         \
  do {                                                                         \
    sph_u32 tmp;                                                               \
    tmp = v##l;                                                                \
    d##l = v##h;                                                               \
    d##h = tmp;                                                                \
  } while (0)

/*
 * Rotate by 32 + n bits (the dispatch table passes n in 1..31): first
 * rotate v by n into the scratch lane `tr`, then store its two words
 * swapped. The scratch variables must be named trl / trh because
 * ROL64_small pastes `l` / `h` onto the lane name it is given.
 */
#define ROL64_big(d, v, n)                                                     \
  do {                                                                         \
    sph_u32 trl, trh;                                                          \
    ROL64_small(tr, v, n);                                                     \
    d##h = trl;                                                                \
    d##l = trh;                                                                \
  } while (0)

/*
 * Dispatch table for ROL64(d, v, k), k = 33..63, in the non-interleaved
 * representation: each entry forwards to ROL64_big with n = k - 32.
 */
#define ROL64_33(d, v) ROL64_big(d, v, 1)
#define ROL64_34(d, v) ROL64_big(d, v, 2)
#define ROL64_35(d, v) ROL64_big(d, v, 3)
#define ROL64_36(d, v) ROL64_big(d, v, 4)
#define ROL64_37(d, v) ROL64_big(d, v, 5)
#define ROL64_38(d, v) ROL64_big(d, v, 6)
#define ROL64_39(d, v) ROL64_big(d, v, 7)
#define ROL64_40(d, v) ROL64_big(d, v, 8)
#define ROL64_41(d, v) ROL64_big(d, v, 9)
#define ROL64_42(d, v) ROL64_big(d, v, 10)
#define ROL64_43(d, v) ROL64_big(d, v, 11)
#define ROL64_44(d, v) ROL64_big(d, v, 12)
#define ROL64_45(d, v) ROL64_big(d, v, 13)
#define ROL64_46(d, v) ROL64_big(d, v, 14)
#define ROL64_47(d, v) ROL64_big(d, v, 15)
#define ROL64_48(d, v) ROL64_big(d, v, 16)
#define ROL64_49(d, v) ROL64_big(d, v, 17)
#define ROL64_50(d, v) ROL64_big(d, v, 18)
#define ROL64_51(d, v) ROL64_big(d, v, 19)
#define ROL64_52(d, v) ROL64_big(d, v, 20)
#define ROL64_53(d, v) ROL64_big(d, v, 21)
#define ROL64_54(d, v) ROL64_big(d, v, 22)
#define ROL64_55(d, v) ROL64_big(d, v, 23)
#define ROL64_56(d, v) ROL64_big(d, v, 24)
#define ROL64_57(d, v) ROL64_big(d, v, 25)
#define ROL64_58(d, v) ROL64_big(d, v, 26)
#define ROL64_59(d, v) ROL64_big(d, v, 27)
#define ROL64_60(d, v) ROL64_big(d, v, 28)
#define ROL64_61(d, v) ROL64_big(d, v, 29)
#define ROL64_62(d, v) ROL64_big(d, v, 30)
#define ROL64_63(d, v) ROL64_big(d, v, 31)

#endif

/*
 * d = s ^ k, where k is a two-word constant read through its .low and .high
 * members. k is used unparenthesised, so pass a simple object expression.
 */
#define XOR64_IOTA(d, s, k) (d##l = s##l ^ k.low, d##h = s##h ^ k.high)

#endif

/*
 * Compute one theta correction value:
 *
 *   t = (c0 ^ c1 ^ c2 ^ c3 ^ c4) ^ ROL64(d0 ^ d1 ^ d2 ^ d3 ^ d4, 1)
 *
 * The d lanes are folded into one accumulator and rotated by one bit; the
 * c lanes are folded into a second accumulator; t receives their XOR.
 * None of the input lanes is modified.
 */
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)                      \
  do {                                                                         \
    DECL64(dsum);                                                              \
    DECL64(csum);                                                              \
    XOR64(dsum, d0, d1);                                                       \
    XOR64(dsum, dsum, d2);                                                     \
    XOR64(dsum, dsum, d3);                                                     \
    XOR64(dsum, dsum, d4);                                                     \
    ROL64(dsum, dsum, 1);                                                      \
    XOR64(csum, c0, c1);                                                       \
    XOR64(csum, csum, c2);                                                     \
    XOR64(csum, csum, c3);                                                     \
    XOR64(csum, csum, c4);                                                     \
    XOR64(t, dsum, csum);                                                      \
  } while (0)

/*
 * Theta step over the 25 lanes bXY (X = first digit, Y = second digit).
 *
 * For each X a correction value dxX is built with TH_ELT from the five
 * lanes with first digit X - 1 and the five lanes with first digit X + 1
 * (indices taken modulo 5). All five corrections are computed before any
 * lane is changed; each is then XORed into the five lanes bX0 .. bX4.
 */
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, b20, b21, b22, \
              b23, b24, b30, b31, b32, b33, b34, b40, b41, b42, b43, b44)      \
  do {                                                                         \
    DECL64(dx0);                                                               \
    DECL64(dx1);                                                               \
    DECL64(dx2);                                                               \
    DECL64(dx3);                                                               \
    DECL64(dx4);                                                               \
    TH_ELT(dx0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14);             \
    TH_ELT(dx1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24);             \
    TH_ELT(dx2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34);             \
    TH_ELT(dx3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44);             \
    TH_ELT(dx4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04);             \
    XOR64(b00, b00, dx0);                                                      \
    XOR64(b10, b10, dx1);                                                      \
    XOR64(b20, b20, dx2);                                                      \
    XOR64(b30, b30, dx3);                                                      \
    XOR64(b40, b40, dx4);                                                      \
    XOR64(b01, b01, dx0);                                                      \
    XOR64(b11, b11, dx1);                                                      \
    XOR64(b21, b21, dx2);                                                      \
    XOR64(b31, b31, dx3);                                                      \
    XOR64(b41, b41, dx4);                                                      \
    XOR64(b02, b02, dx0);                                                      \
    XOR64(b12, b12, dx1);                                                      \
    XOR64(b22, b22, dx2);                                                      \
    XOR64(b32, b32, dx3);                                                      \
    XOR64(b42, b42, dx4);                                                      \
    XOR64(b03, b03, dx0);                                                      \
    XOR64(b13, b13, dx1);                                                      \
    XOR64(b23, b23, dx2);                                                      \
    XOR64(b33, b33, dx3);                                                      \
    XOR64(b43, b43, dx4);                                                      \
    XOR64(b04, b04, dx0);                                                      \
    XOR64(b14, b14, dx1);                                                      \
    XOR64(b24, b24, dx2);                                                      \
    XOR64(b34, b34, dx3);                                                      \
    XOR64(b44, b44, dx4);                                                      \
  } while (0)

/*
 * Rho step: rotate every lane in place by its fixed bit count. The
 * rotations are independent of each other and are listed here with the
 * second digit of the lane name varying slowest.
 */
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, b20, b21, b22,   \
            b23, b24, b30, b31, b32, b33, b34, b40, b41, b42, b43, b44)        \
  do {                                                                         \
    /* b00 has rotation count 0 and is left untouched. */                      \
    ROL64(b10, b10, 1);                                                        \
    ROL64(b20, b20, 62);                                                       \
    ROL64(b30, b30, 28);                                                       \
    ROL64(b40, b40, 27);                                                       \
    ROL64(b01, b01, 36);                                                       \
    ROL64(b11, b11, 44);                                                       \
    ROL64(b21, b21, 6);                                                        \
    ROL64(b31, b31, 55);                                                       \
    ROL64(b41, b41, 20);                                                       \
    ROL64(b02, b02, 3);                                                        \
    ROL64(b12, b12, 10);                                                       \
    ROL64(b22, b22, 43);                                                       \
    ROL64(b32, b32, 25);                                                       \
    ROL64(b42, b42, 39);                                                       \
    ROL64(b03, b03, 41);                                                       \
    ROL64(b13, b13, 45);                                                       \
    ROL64(b23, b23, 15);                                                       \
    ROL64(b33, b33, 21);                                                       \
    ROL64(b43, b43, 8);                                                        \
    ROL64(b04, b04, 18);                                                       \
    ROL64(b14, b14, 2);                                                        \
    ROL64(b24, b24, 61);                                                       \
    ROL64(b34, b34, 56);                                                       \
    ROL64(b44, b44, 14);                                                       \
  } while (0)

/*
 * The KHI macro integrates the "lane complement" optimization. On input,
 * some words are complemented:
 *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
 * On output, the following words are complemented:
 *    a04 a10 a20 a22 a23 a31
 *
 * The (implicit) permutation and the theta expansion will bring back
 * the input mask for the next round.
 */

/*
 * KHI_XO(d, a, b, c): d = a XOR (b OR c), expressed with the OR64 and
 * XOR64 helper macros. The OR result is kept in a scratch word that is
 * local to the macro's own block.
 */
#define KHI_XO(d, a, b, c)                                                     \
  do {                                                                         \
    DECL64(khi_or);                                                            \
    OR64(khi_or, b, c);                                                        \
    XOR64(d, a, khi_or);                                                       \
  } while (0)

/*
 * KHI_XA(d, a, b, c): d = a XOR (b AND c), expressed with the AND64 and
 * XOR64 helper macros. The AND result is kept in a scratch word that is
 * local to the macro's own block.
 */
#define KHI_XA(d, a, b, c)                                                     \
  do {                                                                         \
    DECL64(khi_and);                                                           \
    AND64(khi_and, b, c);                                                      \
    XOR64(d, a, khi_and);                                                      \
  } while (0)

/*
 * KHI: the chi step over all 25 lanes, with the lane-complement trick
 * folded in (the comment above KHI_XO lists which lanes are inverted on
 * input and on output).
 *
 * The 25 arguments are lane names, normally supplied through one of the
 * P<n> lists. They are processed as five independent groups of five
 * lanes: (b00 b10 b20 b30 b40), (b01 b11 b21 b31 b41), ... ,
 * (b04 b14 b24 b34 b44).
 *
 * Within a group (x0..x4) each new lane has the shape
 *     x[i] ^ (x[i+1] OP x[i+2])        (indices mod 5)
 * where OP is OR (KHI_XO) or AND (KHI_XA). In every group one lane is
 * inverted with NOT64 into bnn, and bnn replaces the plain lane in the
 * one or two expressions that need the complement; the exact
 * OR/AND/NOT pattern differs from group to group.
 *
 * All five results are staged in c0..c4 before any MOV64 write-back, so
 * every expression of a group reads that group's pre-update values.
 */
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, b20, b21, b22,   \
            b23, b24, b30, b31, b32, b33, b34, b40, b41, b42, b43, b44)        \
  do {                                                                         \
    DECL64(c0);                                                                \
    DECL64(c1);                                                                \
    DECL64(c2);                                                                \
    DECL64(c3);                                                                \
    DECL64(c4);                                                                \
    DECL64(bnn);                                                               \
    NOT64(bnn, b20);                                                           \
    KHI_XO(c0, b00, b10, b20);                                                 \
    KHI_XO(c1, b10, bnn, b30);                                                 \
    KHI_XA(c2, b20, b30, b40);                                                 \
    KHI_XO(c3, b30, b40, b00);                                                 \
    KHI_XA(c4, b40, b00, b10);                                                 \
    MOV64(b00, c0);                                                            \
    MOV64(b10, c1);                                                            \
    MOV64(b20, c2);                                                            \
    MOV64(b30, c3);                                                            \
    MOV64(b40, c4);                                                            \
    NOT64(bnn, b41);                                                           \
    KHI_XO(c0, b01, b11, b21);                                                 \
    KHI_XA(c1, b11, b21, b31);                                                 \
    KHI_XO(c2, b21, b31, bnn);                                                 \
    KHI_XO(c3, b31, b41, b01);                                                 \
    KHI_XA(c4, b41, b01, b11);                                                 \
    MOV64(b01, c0);                                                            \
    MOV64(b11, c1);                                                            \
    MOV64(b21, c2);                                                            \
    MOV64(b31, c3);                                                            \
    MOV64(b41, c4);                                                            \
    NOT64(bnn, b32);                                                           \
    KHI_XO(c0, b02, b12, b22);                                                 \
    KHI_XA(c1, b12, b22, b32);                                                 \
    KHI_XA(c2, b22, bnn, b42);                                                 \
    KHI_XO(c3, bnn, b42, b02);                                                 \
    KHI_XA(c4, b42, b02, b12);                                                 \
    MOV64(b02, c0);                                                            \
    MOV64(b12, c1);                                                            \
    MOV64(b22, c2);                                                            \
    MOV64(b32, c3);                                                            \
    MOV64(b42, c4);                                                            \
    NOT64(bnn, b33);                                                           \
    KHI_XA(c0, b03, b13, b23);                                                 \
    KHI_XO(c1, b13, b23, b33);                                                 \
    KHI_XO(c2, b23, bnn, b43);                                                 \
    KHI_XA(c3, bnn, b43, b03);                                                 \
    KHI_XO(c4, b43, b03, b13);                                                 \
    MOV64(b03, c0);                                                            \
    MOV64(b13, c1);                                                            \
    MOV64(b23, c2);                                                            \
    MOV64(b33, c3);                                                            \
    MOV64(b43, c4);                                                            \
    NOT64(bnn, b14);                                                           \
    KHI_XA(c0, b04, bnn, b24);                                                 \
    KHI_XO(c1, bnn, b24, b34);                                                 \
    KHI_XA(c2, b24, b34, b44);                                                 \
    KHI_XO(c3, b34, b44, b04);                                                 \
    KHI_XA(c4, b44, b04, b14);                                                 \
    MOV64(b04, c0);                                                            \
    MOV64(b14, c1);                                                            \
    MOV64(b24, c2);                                                            \
    MOV64(b34, c3);                                                            \
    MOV64(b44, c4);                                                            \
  } while (0)

/*
 * IOTA(r): combine round constant r into lane a00 through XOR64_IOTA,
 * with a00 as both destination and source operand. Unlike the other step
 * macros it names a00 directly rather than taking a lane list; a00 is the
 * first entry of every P<n> list, so it is the same lane in every round.
 */
#define IOTA(r) XOR64_IOTA(a00, a00, r)

/*
 * Lane-name lists P0 .. P23.
 *
 * Each P<n> expands to the same 25 lane names in a different order. The
 * list is handed, in that order, to the 25 lane parameters of a step
 * macro: KF_ELT pastes the list name as P##r for THETA and RHO and as
 * P##s for KHI. P0 is the natural order a00, a01, ..., a44. a00 is the
 * first entry of every list.
 *
 * NOTE(review): this looks like the mechanism behind the "(implicit)
 * permutation" mentioned in the KHI comment -- lanes are addressed under
 * permuted names from round to round instead of being moved. The
 * P<n>_TO_P0 macros defined after these lists move the lanes back to the
 * P0 arrangement where a partially unrolled loop wraps around.
 */
#define P0                                                                     \
  a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, a22, a23, a24,   \
      a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1                                                                     \
  a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, a32, a12, a42,   \
      a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2                                                                     \
  a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, a43, a21, a04,   \
      a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3                                                                     \
  a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, a34, a02, a20,   \
      a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4                                                                     \
  a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, a03, a10, a22,   \
      a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5                                                                     \
  a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, a40, a11, a32,   \
      a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6                                                                     \
  a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, a44, a41, a43,   \
      a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7                                                                     \
  a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, a14, a24, a34,   \
      a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8                                                                     \
  a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, a31, a42, a03,   \
      a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9                                                                     \
  a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, a13, a04, a40,   \
      a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10                                                                    \
  a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, a01, a20, a44,   \
      a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11                                                                    \
  a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, a30, a22, a14,   \
      a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12                                                                    \
  a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, a33, a32, a31,   \
      a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13                                                                    \
  a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, a23, a43, a13,   \
      a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14                                                                    \
  a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, a12, a34, a01,   \
      a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15                                                                    \
  a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, a21, a03, a30,   \
      a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16                                                                    \
  a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, a02, a40, a33,   \
      a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17                                                                    \
  a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, a10, a44, a23,   \
      a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18                                                                    \
  a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, a11, a14, a12,   \
      a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19                                                                    \
  a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, a41, a31, a21,   \
      a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20                                                                    \
  a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, a24, a13, a02,   \
      a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21                                                                    \
  a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, a42, a01, a10,   \
      a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22                                                                    \
  a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, a04, a30, a11,   \
      a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23                                                                    \
  a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, a20, a33, a41,   \
      a01, a14, a22, a30, a43, a03, a11, a24, a32, a40

/*
 * P1_TO_P0: physically move the lanes from the P1 arrangement back to P0.
 *
 * Every MOV64(x, y) gives x the lane that the P1 list holds at x's
 * position in P0 (a01 is the 2nd entry of P0 and a30 the 2nd entry of P1,
 * hence MOV64(a01, a30), and so on). The moves form a single cycle through
 * all 24 lanes other than a00, which heads both lists and is not touched.
 * t saves a01 before it is overwritten and closes the cycle at the end, so
 * the statement order must not be changed.
 */
#define P1_TO_P0                                                               \
  do {                                                                         \
    DECL64(t);                                                                 \
    MOV64(t, a01);                                                             \
    MOV64(a01, a30);                                                           \
    MOV64(a30, a33);                                                           \
    MOV64(a33, a23);                                                           \
    MOV64(a23, a12);                                                           \
    MOV64(a12, a21);                                                           \
    MOV64(a21, a02);                                                           \
    MOV64(a02, a10);                                                           \
    MOV64(a10, a11);                                                           \
    MOV64(a11, a41);                                                           \
    MOV64(a41, a24);                                                           \
    MOV64(a24, a42);                                                           \
    MOV64(a42, a04);                                                           \
    MOV64(a04, a20);                                                           \
    MOV64(a20, a22);                                                           \
    MOV64(a22, a32);                                                           \
    MOV64(a32, a43);                                                           \
    MOV64(a43, a34);                                                           \
    MOV64(a34, a03);                                                           \
    MOV64(a03, a40);                                                           \
    MOV64(a40, a44);                                                           \
    MOV64(a44, a14);                                                           \
    MOV64(a14, a31);                                                           \
    MOV64(a31, a13);                                                           \
    MOV64(a13, t);                                                             \
  } while (0)

/*
 * P2_TO_P0: physically move the lanes from the P2 arrangement back to P0.
 *
 * Every MOV64(x, y) gives x the lane that the P2 list holds at x's
 * position in P0. The moves split into two independent cycles of twelve
 * lanes each, the first starting at a01 and the second at a10; a00 is not
 * touched. t saves the first lane of each cycle and closes it at the end,
 * so the order inside a cycle must not be changed.
 */
#define P2_TO_P0                                                               \
  do {                                                                         \
    DECL64(t);                                                                 \
    MOV64(t, a01);                                                             \
    MOV64(a01, a33);                                                           \
    MOV64(a33, a12);                                                           \
    MOV64(a12, a02);                                                           \
    MOV64(a02, a11);                                                           \
    MOV64(a11, a24);                                                           \
    MOV64(a24, a04);                                                           \
    MOV64(a04, a22);                                                           \
    MOV64(a22, a43);                                                           \
    MOV64(a43, a03);                                                           \
    MOV64(a03, a44);                                                           \
    MOV64(a44, a31);                                                           \
    MOV64(a31, t);                                                             \
    MOV64(t, a10);                                                             \
    MOV64(a10, a41);                                                           \
    MOV64(a41, a42);                                                           \
    MOV64(a42, a20);                                                           \
    MOV64(a20, a32);                                                           \
    MOV64(a32, a34);                                                           \
    MOV64(a34, a40);                                                           \
    MOV64(a40, a14);                                                           \
    MOV64(a14, a13);                                                           \
    MOV64(a13, a30);                                                           \
    MOV64(a30, a23);                                                           \
    MOV64(a23, a21);                                                           \
    MOV64(a21, t);                                                             \
  } while (0)

/*
 * P4_ROT6: rotate six lanes one step through scratch word tmp:
 * p <- q <- r <- s <- u <- v <- (old p).
 */
#define P4_ROT6(tmp, p, q, r, s, u, v)                                         \
  do {                                                                         \
    MOV64(tmp, p);                                                             \
    MOV64(p, q);                                                               \
    MOV64(q, r);                                                               \
    MOV64(r, s);                                                               \
    MOV64(s, u);                                                               \
    MOV64(u, v);                                                               \
    MOV64(v, tmp);                                                             \
  } while (0)

/*
 * P4_TO_P0: physically move the lanes from the P4 arrangement back to P0.
 * The permutation splits into four cycles of six lanes; a00 is untouched.
 */
#define P4_TO_P0                                                               \
  do {                                                                         \
    DECL64(hold);                                                              \
    P4_ROT6(hold, a01, a12, a11, a04, a43, a44);                               \
    P4_ROT6(hold, a02, a24, a22, a03, a31, a33);                               \
    P4_ROT6(hold, a10, a42, a32, a40, a13, a23);                               \
    P4_ROT6(hold, a14, a30, a21, a41, a20, a34);                               \
  } while (0)

/*
 * P6_ROT4: rotate four lanes one step through scratch word tmp:
 * p <- q <- r <- s <- (old p).
 */
#define P6_ROT4(tmp, p, q, r, s)                                               \
  do {                                                                         \
    MOV64(tmp, p);                                                             \
    MOV64(p, q);                                                               \
    MOV64(q, r);                                                               \
    MOV64(r, s);                                                               \
    MOV64(s, tmp);                                                             \
  } while (0)

/*
 * P6_TO_P0: physically move the lanes from the P6 arrangement back to P0.
 * The permutation splits into six cycles of four lanes; a00 is untouched.
 */
#define P6_TO_P0                                                               \
  do {                                                                         \
    DECL64(hold);                                                              \
    P6_ROT4(hold, a01, a02, a04, a03);                                         \
    P6_ROT4(hold, a10, a20, a40, a30);                                         \
    P6_ROT4(hold, a11, a22, a44, a33);                                         \
    P6_ROT4(hold, a12, a24, a43, a31);                                         \
    P6_ROT4(hold, a13, a21, a42, a34);                                         \
    P6_ROT4(hold, a14, a23, a41, a32);                                         \
  } while (0)

/*
 * P8_ROT3: rotate three lanes one step through scratch word tmp:
 * p <- q <- r <- (old p).
 */
#define P8_ROT3(tmp, p, q, r)                                                  \
  do {                                                                         \
    MOV64(tmp, p);                                                             \
    MOV64(p, q);                                                               \
    MOV64(q, r);                                                               \
    MOV64(r, tmp);                                                             \
  } while (0)

/*
 * P8_TO_P0: physically move the lanes from the P8 arrangement back to P0.
 * The permutation splits into eight cycles of three lanes; a00 is
 * untouched.
 */
#define P8_TO_P0                                                               \
  do {                                                                         \
    DECL64(hold);                                                              \
    P8_ROT3(hold, a01, a11, a43);                                              \
    P8_ROT3(hold, a02, a22, a31);                                              \
    P8_ROT3(hold, a03, a33, a24);                                              \
    P8_ROT3(hold, a04, a44, a12);                                              \
    P8_ROT3(hold, a10, a32, a13);                                              \
    P8_ROT3(hold, a14, a21, a20);                                              \
    P8_ROT3(hold, a23, a42, a40);                                              \
    P8_ROT3(hold, a30, a41, a34);                                              \
  } while (0)

/* P12_SWAP: exchange lanes p and q through scratch word tmp. */
#define P12_SWAP(tmp, p, q)                                                    \
  do {                                                                         \
    MOV64(tmp, p);                                                             \
    MOV64(p, q);                                                               \
    MOV64(q, tmp);                                                             \
  } while (0)

/*
 * P12_TO_P0: physically move the lanes from the P12 arrangement back to
 * P0. The permutation is twelve disjoint lane swaps; a00 is untouched.
 */
#define P12_TO_P0                                                              \
  do {                                                                         \
    DECL64(hold);                                                              \
    P12_SWAP(hold, a01, a04);                                                  \
    P12_SWAP(hold, a02, a03);                                                  \
    P12_SWAP(hold, a10, a40);                                                  \
    P12_SWAP(hold, a11, a44);                                                  \
    P12_SWAP(hold, a12, a43);                                                  \
    P12_SWAP(hold, a13, a42);                                                  \
    P12_SWAP(hold, a14, a41);                                                  \
    P12_SWAP(hold, a20, a30);                                                  \
    P12_SWAP(hold, a21, a34);                                                  \
    P12_SWAP(hold, a22, a33);                                                  \
    P12_SWAP(hold, a23, a32);                                                  \
    P12_SWAP(hold, a24, a31);                                                  \
  } while (0)

/*
 * LPAR / RPAR expand to the two parenthesis tokens. KF_ELT uses them in
 * place of literal parentheses so that THETA, RHO and KHI are NOT invoked
 * while KF_ELT itself is being expanded: at that point P##r is still one
 * token, whereas KHI (and presumably THETA and RHO) expect 25 separate
 * lane arguments. Once the P<n> list has been expanded the text reads
 * "THETA ( a00, ..., a44 )", and one further macro rescan -- provided by
 * DO() in the definition of KECCAK_F_1600 -- turns that into a real
 * invocation.
 */
#define LPAR   (
#define RPAR   )

/*
 * KF_ELT(r, s, k): one round.
 *   r - number of the lane list (P<r>) handed to THETA and RHO
 *   s - number of the lane list (P<s>) handed to KHI
 *   k - round constant handed to IOTA
 * Typical use is with consecutive lists, e.g. KF_ELT(0, 1, RC[j + 0]).
 */
#define KF_ELT(r, s, k)                                                        \
  do {                                                                         \
    THETA LPAR P##r RPAR;                                                      \
    RHO LPAR P##r RPAR;                                                        \
    KHI LPAR P##s RPAR;                                                        \
    IOTA(k);                                                                   \
  } while (0)

/*
 * DO(x) just yields x, but passing KECCAK_F_1600_ through it gives the
 * preprocessor one extra expansion pass: the argument is fully expanded
 * first, which leaves the "THETA ( ... )" style token runs produced by
 * KF_ELT, and the rescan of DO's result then performs those invocations.
 */
#define DO(x) x

/*
 * KECCAK_F_1600 wraps KECCAK_F_1600_, whose definition is selected by
 * SPH_KECCAK_UNROLL in the conditional block that follows.
 */
#define KECCAK_F_1600 DO(KECCAK_F_1600_)

#if SPH_KECCAK_UNROLL == 1

/*
 * Unroll factor 1: one round per loop iteration, 24 iterations. After
 * each round the lanes are moved back from the P1 arrangement to P0.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int rnd;                                                                   \
    for (rnd = 0; rnd < 24; rnd++) {                                           \
      KF_ELT(0, 1, RC[rnd]);                                                   \
      P1_TO_P0;                                                                \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 2

/*
 * Unroll factor 2: two rounds per loop iteration, 12 iterations. The
 * second round works on the P1 naming; afterwards the lanes are moved
 * back from the P2 arrangement to P0.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int rnd;                                                                   \
    for (rnd = 0; rnd < 24; rnd += 2) {                                        \
      KF_ELT(0, 1, RC[rnd]);                                                   \
      KF_ELT(1, 2, RC[rnd + 1]);                                               \
      P2_TO_P0;                                                                \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 4

/*
 * Unroll factor 4: four rounds per loop iteration, 6 iterations. Rounds
 * step through the P0..P3 namings; afterwards the lanes are moved back
 * from the P4 arrangement to P0.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int rnd;                                                                   \
    for (rnd = 0; rnd < 24; rnd += 4) {                                        \
      KF_ELT(0, 1, RC[rnd]);                                                   \
      KF_ELT(1, 2, RC[rnd + 1]);                                               \
      KF_ELT(2, 3, RC[rnd + 2]);                                               \
      KF_ELT(3, 4, RC[rnd + 3]);                                               \
      P4_TO_P0;                                                                \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 6

/*
 * Unroll factor 6: six rounds per loop iteration, 4 iterations. Rounds
 * step through the P0..P5 namings; afterwards the lanes are moved back
 * from the P6 arrangement to P0.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int rnd;                                                                   \
    for (rnd = 0; rnd < 24; rnd += 6) {                                        \
      KF_ELT(0, 1, RC[rnd]);                                                   \
      KF_ELT(1, 2, RC[rnd + 1]);                                               \
      KF_ELT(2, 3, RC[rnd + 2]);                                               \
      KF_ELT(3, 4, RC[rnd + 3]);                                               \
      KF_ELT(4, 5, RC[rnd + 4]);                                               \
      KF_ELT(5, 6, RC[rnd + 5]);                                               \
      P6_TO_P0;                                                                \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 8

/*
 * Unroll factor 8: eight rounds per loop iteration, 3 iterations. Rounds
 * step through the P0..P7 namings; afterwards the lanes are moved back
 * from the P8 arrangement to P0.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int rnd;                                                                   \
    for (rnd = 0; rnd < 24; rnd += 8) {                                        \
      KF_ELT(0, 1, RC[rnd]);                                                   \
      KF_ELT(1, 2, RC[rnd + 1]);                                               \
      KF_ELT(2, 3, RC[rnd + 2]);                                               \
      KF_ELT(3, 4, RC[rnd + 3]);                                               \
      KF_ELT(4, 5, RC[rnd + 4]);                                               \
      KF_ELT(5, 6, RC[rnd + 5]);                                               \
      KF_ELT(6, 7, RC[rnd + 6]);                                               \
      KF_ELT(7, 8, RC[rnd + 7]);                                               \
      P8_TO_P0;                                                                \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 12

/*
 * KECCAK_F_1600_ with the round loop unrolled 12 times
 * (selected when SPH_KECCAK_UNROLL == 12).
 *
 * The loop body runs for j = 0 and 12. Each pass applies twelve KF_ELT
 * steps with the constants RC[j] .. RC[j + 11], so the 24 entries RC[0..23]
 * are consumed in order, and P12_TO_P0 runs once at the end of every pass.
 *
 * NOTE(review): KF_ELT, RC and P12_TO_P0 are defined outside this section.
 * Presumably KF_ELT(r, s, k) is one round using constant k, and P12_TO_P0
 * restores the lane naming expected by the next KF_ELT(0, ...) -- confirm
 * against their definitions before touching the argument sequence.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    int j;                                                                     \
    for (j = 0; j < 24; j += 12) {                                             \
      KF_ELT(0, 1, RC[j + 0]);                                                 \
      KF_ELT(1, 2, RC[j + 1]);                                                 \
      KF_ELT(2, 3, RC[j + 2]);                                                 \
      KF_ELT(3, 4, RC[j + 3]);                                                 \
      KF_ELT(4, 5, RC[j + 4]);                                                 \
      KF_ELT(5, 6, RC[j + 5]);                                                 \
      KF_ELT(6, 7, RC[j + 6]);                                                 \
      KF_ELT(7, 8, RC[j + 7]);                                                 \
      KF_ELT(8, 9, RC[j + 8]);                                                 \
      KF_ELT(9, 10, RC[j + 9]);                                                \
      KF_ELT(10, 11, RC[j + 10]);                                              \
      KF_ELT(11, 12, RC[j + 11]);                                              \
      P12_TO_P0;                                                               \
    }                                                                          \
  } while (0)

#elif SPH_KECCAK_UNROLL == 0

/*
 * Fully unrolled KECCAK_F_1600_ (selected when SPH_KECCAK_UNROLL == 0).
 *
 * There is no loop: 24 KF_ELT steps are emitted back to back, using the
 * constants RC[0] .. RC[23] in order. The first two arguments advance
 * (0,1), (1,2), ... (22,23), and the final step wraps around to (23, 0),
 * so no separate Pn_TO_P0 step is invoked here.
 *
 * NOTE(review): KF_ELT and RC are defined outside this section; presumably
 * KF_ELT(r, s, k) is one round using constant k with r/s selecting the lane
 * naming before/after the round -- confirm against the definition.
 */
#define KECCAK_F_1600_                                                         \
  do {                                                                         \
    KF_ELT(0, 1, RC[0]);                                                       \
    KF_ELT(1, 2, RC[1]);                                                       \
    KF_ELT(2, 3, RC[2]);                                                       \
    KF_ELT(3, 4, RC[3]);                                                       \
    KF_ELT(4, 5, RC[4]);                                                       \
    KF_ELT(5, 6, RC[5]);                                                       \
    KF_ELT(6, 7, RC[6]);                                                       \
    KF_ELT(7, 8, RC[7]);                                                       \
    KF_ELT(8, 9, RC[8]);                                                       \
    KF_ELT(9, 10, RC[9]);                                                      \
    KF_ELT(10, 11, RC[10]);                                                    \
    KF_ELT(11, 12, RC[11]);                                                    \
    KF_ELT(12, 13, RC[12]);                                                    \
    KF_ELT(13, 14, RC[13]);                                                    \
    KF_ELT(14, 15, RC[14]);                                                    \
    KF_ELT(15, 16, RC[15]);                                                    \
    KF_ELT(16, 17, RC[16]);                                                    \
    KF_ELT(17, 18, RC[17]);                                                    \
    KF_ELT(18, 19, RC[18]);                                                    \
    KF_ELT(19, 20, RC[19]);                                                    \
    KF_ELT(20, 21, RC[20]);                                                    \
    KF_ELT(21, 22, RC[21]);                                                    \
    KF_ELT(22, 23, RC[22]);                                                    \
    KF_ELT(23, 0, RC[23]);                                                     \
  } while (0)

#else

#error Unimplemented unroll count for Keccak.

#endif

/*
 * Reset a Keccak context so that a new message can be absorbed.
 *
 * kc        context to reset
 * out_size  digest size in bits (224, 256, 384 or 512 for the callers in
 *           this file)
 *
 * The whole state is cleared, then the lanes that are kept in complemented
 * form (the "lane complement" representation) are set to all-ones. The
 * buffer fill level is reset and the block size is derived from the digest
 * size: lim = 200 - out_size / 4 bytes.
 */
static void keccak_init(sph_keccak_context *kc, unsigned out_size) {
  /* 64-bit lane numbers that are stored complemented. */
  static const unsigned char inverted_lanes[] = {1, 2, 8, 12, 17, 20};
  size_t k;

#if SPH_KECCAK_64
  for (k = 0; k < 25; k++)
    kc->u.wide[k] = 0;
  for (k = 0; k < sizeof inverted_lanes; k++)
    kc->u.wide[inverted_lanes[k]] = SPH_C64(0xFFFFFFFFFFFFFFFF);
#else
  for (k = 0; k < 50; k++)
    kc->u.narrow[k] = 0;
  /*
   * Every 64-bit lane spans two consecutive 32-bit words (2 * lane and
   * 2 * lane + 1). Both halves become all-ones, so interleaving (if
   * applicable) is a no-op on these values.
   */
  for (k = 0; k < sizeof inverted_lanes; k++) {
    kc->u.narrow[2 * inverted_lanes[k]] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[2 * inverted_lanes[k] + 1] = SPH_C32(0xFFFFFFFF);
  }
#endif
  kc->ptr = 0;
  kc->lim = 200 - (out_size >> 2);
}

/*
 * Absorb `len` bytes from `data` into the context, using `lim` as the
 * block size in bytes.
 *
 * kc    context being updated; kc->buf holds the partially filled block
 *       and kc->ptr the number of bytes currently stored in it
 * data  input bytes (may be NULL only when len is 0)
 * len   number of input bytes
 * lim   block size in bytes; whenever the buffer holds `lim` bytes it is
 *       handed to INPUT_BUF / KECCAK_F_1600 and the fill level restarts at 0
 *
 * Input that does not complete a block is only copied into the buffer; the
 * state is not touched in that case.
 *
 * NOTE(review): DECL_STATE, READ_STATE, INPUT_BUF, KECCAK_F_1600 and
 * WRITE_STATE are macros defined earlier in the file. They may refer to the
 * locals `kc` and `buf` by name, so do not rename those without checking
 * the macro definitions.
 */
static void keccak_core(sph_keccak_context *kc, const void *data, size_t len,
                        size_t lim) {
  unsigned char *buf;
  size_t ptr;
  DECL_STATE

  /*
   * Nothing to absorb. Returning here also keeps a (NULL, 0) input away
   * from memcpy(), whose pointer arguments must be valid even when the
   * length is zero.
   */
  if (len == 0)
    return;

  buf = kc->buf;
  ptr = kc->ptr;

  /* Fast path: the input fits in the buffer without completing a block. */
  if (len < (lim - ptr)) {
    memcpy(buf + ptr, data, len);
    kc->ptr = ptr + len;
    return;
  }

  READ_STATE(kc);
  while (len > 0) {
    size_t clen;

    /* Copy as much as fits into the remainder of the current block. */
    clen = (lim - ptr);
    if (clen > len)
      clen = len;
    memcpy(buf + ptr, data, clen);
    ptr += clen;
    data = (const unsigned char *)data + clen;
    len -= clen;
    if (ptr == lim) {
      /* Block complete: feed it to the state and start a new one. */
      INPUT_BUF(lim);
      KECCAK_F_1600;
      ptr = 0;
    }
  }
  WRITE_STATE(kc);
  kc->ptr = ptr;
}

#if SPH_KECCAK_64

/*
 * DEFCLOSE(d, lim) -- variant used when SPH_KECCAK_64 is set.
 *
 * Defines keccak_close<d>(kc, ub, n, dst), which pads the buffered input,
 * processes the final block(s) through keccak_core() with block size `lim`,
 * undoes the "lane complement" on the state, writes the first `d` bytes of
 * the state (little-endian 64-bit lanes) to `dst`, and finally resets the
 * context with keccak_init() so it can be reused, as the 32-bit variant
 * does.
 *
 * The first padding byte is taken from the global hard_coded_eb instead of
 * being derived from ub/n, so `ub` is ignored here; `n` is only used to
 * pick the n == 7 layout when a single byte is left in the block.
 * NOTE(review): hard_coded_eb is a non-static global and may be changed by
 * code outside this file -- confirm before replacing it with the ub/n
 * formula used by the 32-bit variant.
 */
#define DEFCLOSE(d, lim)                                                       \
  static void keccak_close##d(sph_keccak_context *kc,                          \
                              unsigned ub, unsigned n,                         \
                              void *dst) {                                     \
    unsigned eb;                                                               \
    union {                                                                    \
      unsigned char tmp[lim + 1];                                              \
      sph_u64 dummy; /* for alignment */                                       \
    } u;                                                                       \
    size_t j;                                                                  \
                                                                               \
    /* ub is deliberately unused: the padding byte is hard_coded_eb. */        \
    (void)ub;                                                                  \
    eb = (unsigned)hard_coded_eb;                                              \
    if (kc->ptr == (lim - 1)) {                                                \
      if (n == 7) {                                                            \
        /* One byte left: the 0x80 end marker goes into an extra block. */     \
        u.tmp[0] = eb;                                                         \
        memset(u.tmp + 1, 0, lim - 1);                                         \
        u.tmp[lim] = 0x80;                                                     \
        j = 1 + lim;                                                           \
      } else {                                                                 \
        /* One byte left: first and last padding byte coincide. */             \
        u.tmp[0] = eb | 0x80;                                                  \
        j = 1;                                                                 \
      }                                                                        \
    } else {                                                                   \
      /* General case: eb, zero fill, 0x80 in the block's last byte. */        \
      j = lim - kc->ptr;                                                       \
      u.tmp[0] = eb;                                                           \
      memset(u.tmp + 1, 0, j - 2);                                             \
      u.tmp[j - 1] = 0x80;                                                     \
    }                                                                          \
    keccak_core(kc, u.tmp, j, lim);                                            \
    /* Finalize the "lane complement" */                                       \
    kc->u.wide[1] = ~kc->u.wide[1];                                            \
    kc->u.wide[2] = ~kc->u.wide[2];                                            \
    kc->u.wide[8] = ~kc->u.wide[8];                                            \
    kc->u.wide[12] = ~kc->u.wide[12];                                          \
    kc->u.wide[17] = ~kc->u.wide[17];                                          \
    kc->u.wide[20] = ~kc->u.wide[20];                                          \
    for (j = 0; j < d; j += 8)                                                 \
      sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]);                      \
    memcpy(dst, u.tmp, d);                                                     \
    /* Leave the context ready for a new message. */                           \
    keccak_init(kc, (unsigned)d << 3);                                         \
  }

#else

/*
 * DEFCLOSE(d, lim) -- variant used when SPH_KECCAK_64 is not set.
 *
 * Defines keccak_close<d>(kc, ub, n, dst):
 *   1. builds the padding: a first byte made of the top `n` bits of the low
 *      byte of `ub` followed by a 1 bit, zero fill, and 0x80 in the last
 *      byte of the block;
 *   2. feeds it to keccak_core() with block size `lim`;
 *   3. undoes the "lane complement", un-interleaves the 32-bit words and
 *      writes the first `d` bytes of the state to `dst` (little-endian);
 *   4. resets the context with keccak_init() for a `d * 8`-bit digest.
 */
#define DEFCLOSE(d, lim)                                                       \
  static void keccak_close##d(sph_keccak_context *kc, unsigned ub, unsigned n, \
                              void *dst) {                                     \
    /* 32-bit words (two per 64-bit lane) that are stored complemented. */     \
    static const unsigned char inverted[] = {2,  3,  4,  5,  16, 17,           \
                                             24, 25, 34, 35, 40, 41};          \
    union {                                                                    \
      unsigned char tmp[lim + 1];                                              \
      sph_u64 dummy; /* for alignment */                                       \
    } pad;                                                                     \
    const unsigned first = (0x100 | (ub & 0xFF)) >> (8 - n);                   \
    size_t count = lim - kc->ptr; /* bytes left in the current block */        \
    size_t j;                                                                  \
                                                                               \
    if (count == 1 && n != 7) {                                                \
      /* A single free byte holds both the first byte and the end marker. */   \
      pad.tmp[0] = first | 0x80;                                               \
    } else {                                                                   \
      /* With n == 7 the first byte is full, so a lone free byte cannot */     \
      /* also carry the end marker: pad through one additional block. */       \
      if (count == 1)                                                          \
        count += lim;                                                          \
      memset(pad.tmp, 0, count);                                               \
      pad.tmp[0] = first;                                                      \
      pad.tmp[count - 1] = 0x80;                                               \
    }                                                                          \
    keccak_core(kc, pad.tmp, count, lim);                                      \
    /* Undo the "lane complement" on the final state. */                       \
    for (j = 0; j < sizeof inverted; j++)                                      \
      kc->u.narrow[inverted[j]] = ~kc->u.narrow[inverted[j]];                  \
    /* Un-interleave every (low, high) word pair. */                           \
    for (j = 0; j < 50; j += 2)                                                \
      UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]);                      \
    /* Serialize the first d bytes of the state and hand them out. */          \
    for (j = 0; j < d; j += 4)                                                 \
      sph_enc32le_aligned(pad.tmp + j, kc->u.narrow[j >> 2]);                  \
    memcpy(dst, pad.tmp, d);                                                   \
    keccak_init(kc, (unsigned)d << 3);                                         \
  }

#endif

/*
 * Instantiate the closing functions. DEFCLOSE(d, lim) defines
 * keccak_close<d>(), where `d` is the number of digest bytes copied to the
 * caller's buffer and `lim` is the block size in bytes handed to
 * keccak_core(). Each block size matches the one used by the corresponding
 * sph_keccak224/256/384/512() update function below.
 */
DEFCLOSE(28, 144)
DEFCLOSE(32, 136)
DEFCLOSE(48, 104)
DEFCLOSE(64, 72)

/* see sph_keccak.h */
void sph_keccak224_init(void *cc) { keccak_init(cc, 224); }

/* see sph_keccak.h */
void sph_keccak224(void *cc, const void *data, size_t len) {
  keccak_core(cc, data, len, 144);
}

/*
 * Finish a Keccak-224 computation and write the digest to `dst` (see
 * sph_keccak.h). Equivalent to sph_keccak224_addbits_and_close() with no
 * extra bits (ub = 0, n = 0).
 */
void sph_keccak224_close(void *cc, void *dst) {
  const unsigned extra_bits = 0;
  const unsigned extra_bit_count = 0;

  sph_keccak224_addbits_and_close(cc, extra_bits, extra_bit_count, dst);
}

/* see sph_keccak.h */
void sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n,
                                     void *dst) {
  keccak_close28(cc, ub, n, dst);
}

/* see sph_keccak.h */
void sph_keccak256_init(void *cc) { keccak_init(cc, 256); }

/* see sph_keccak.h */
void sph_keccak256(void *cc, const void *data, size_t len) {
  keccak_core(cc, data, len, 136);
}

/*
 * Finish a Keccak-256 computation and write the digest to `dst` (see
 * sph_keccak.h). Equivalent to sph_keccak256_addbits_and_close() with no
 * extra bits (ub = 0, n = 0).
 */
void sph_keccak256_close(void *cc, void *dst) {
  const unsigned extra_bits = 0;
  const unsigned extra_bit_count = 0;

  sph_keccak256_addbits_and_close(cc, extra_bits, extra_bit_count, dst);
}

/* see sph_keccak.h */
void sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n,
                                     void *dst) {
  keccak_close32(cc, ub, n, dst);
}

/* see sph_keccak.h */
void sph_keccak384_init(void *cc) { keccak_init(cc, 384); }

/* see sph_keccak.h */
void sph_keccak384(void *cc, const void *data, size_t len) {
  keccak_core(cc, data, len, 104);
}

/*
 * Finish a Keccak-384 computation and write the digest to `dst` (see
 * sph_keccak.h). Equivalent to sph_keccak384_addbits_and_close() with no
 * extra bits (ub = 0, n = 0).
 */
void sph_keccak384_close(void *cc, void *dst) {
  const unsigned extra_bits = 0;
  const unsigned extra_bit_count = 0;

  sph_keccak384_addbits_and_close(cc, extra_bits, extra_bit_count, dst);
}

/* see sph_keccak.h */
void sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n,
                                     void *dst) {
  keccak_close48(cc, ub, n, dst);
}

/* see sph_keccak.h */
void sph_keccak512_init(void *cc) { keccak_init(cc, 512); }

/* see sph_keccak.h */
void sph_keccak512(void *cc, const void *data, size_t len) {
  keccak_core(cc, data, len, 72);
}

/*
 * Finish a Keccak-512 computation and write the digest to `dst` (see
 * sph_keccak.h). Equivalent to sph_keccak512_addbits_and_close() with no
 * extra bits (ub = 0, n = 0).
 */
void sph_keccak512_close(void *cc, void *dst) {
  const unsigned extra_bits = 0;
  const unsigned extra_bit_count = 0;

  sph_keccak512_addbits_and_close(cc, extra_bits, extra_bit_count, dst);
}

/* see sph_keccak.h */
void sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n,
                                     void *dst) {
  keccak_close64(cc, ub, n, dst);
}

#ifdef __cplusplus
}
#endif