161 lines
8.3 KiB
Common Lisp
161 lines
8.3 KiB
Common Lisp
#ifndef WOLF_AES_CL
|
|
#define WOLF_AES_CL
|
|
|
|
#ifdef cl_amd_media_ops2
// The AMD media-ops extension is available: enable it so the amd_bfe()
// builtin can be used directly, and give file-scope tables internal linkage.
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#define STATIC static
#else
// Portable fallback for amd_bfe(): extract `width` bits of `src0` starting at
// bit `offset` (bit 0 = least significant) by shifting the field to the top of
// the word and back down again.
//
// Every argument is parenthesized so that expression arguments (for example
// `a | b` or `4 + 4`) are not re-associated by operator precedence.
//
// The shift counts are only in range for 1 <= width and offset + width <= 32,
// and src0 must be an unsigned 32-bit value so the right shift is logical.
// Arguments are evaluated more than once; do not pass expressions with side
// effects.
#define amd_bfe(src0, offset, width) (((src0) << (32 - (offset) - (width))) >> (32 - (width)))
#define STATIC
#endif
|
|
|
|
// AES lookup table: 256 32-bit words, one per possible byte value.
// This is the only table stored in this file; per the original note, the
// other three are generated on the fly rather than kept as constants.
// NOTE(review): the first entries pack, from low byte to high byte,
// {2*S, S, S, 3*S} for the matching sbox[] value S (e.g. sbox[0] = 0x63 and
// AES0_C[0] = 0xA56363C6), i.e. this looks like the usual first AES T-table
// in little-endian byte order. Presumably it is copied into the __local
// tables that the round functions below receive -- confirm in the caller.
|
|
|
|
STATIC const __constant uint AES0_C[256] =
|
|
{
|
|
0xA56363C6U, 0x847C7CF8U, 0x997777EEU, 0x8D7B7BF6U,
|
|
0x0DF2F2FFU, 0xBD6B6BD6U, 0xB16F6FDEU, 0x54C5C591U,
|
|
0x50303060U, 0x03010102U, 0xA96767CEU, 0x7D2B2B56U,
|
|
0x19FEFEE7U, 0x62D7D7B5U, 0xE6ABAB4DU, 0x9A7676ECU,
|
|
0x45CACA8FU, 0x9D82821FU, 0x40C9C989U, 0x877D7DFAU,
|
|
0x15FAFAEFU, 0xEB5959B2U, 0xC947478EU, 0x0BF0F0FBU,
|
|
0xECADAD41U, 0x67D4D4B3U, 0xFDA2A25FU, 0xEAAFAF45U,
|
|
0xBF9C9C23U, 0xF7A4A453U, 0x967272E4U, 0x5BC0C09BU,
|
|
0xC2B7B775U, 0x1CFDFDE1U, 0xAE93933DU, 0x6A26264CU,
|
|
0x5A36366CU, 0x413F3F7EU, 0x02F7F7F5U, 0x4FCCCC83U,
|
|
0x5C343468U, 0xF4A5A551U, 0x34E5E5D1U, 0x08F1F1F9U,
|
|
0x937171E2U, 0x73D8D8ABU, 0x53313162U, 0x3F15152AU,
|
|
0x0C040408U, 0x52C7C795U, 0x65232346U, 0x5EC3C39DU,
|
|
0x28181830U, 0xA1969637U, 0x0F05050AU, 0xB59A9A2FU,
|
|
0x0907070EU, 0x36121224U, 0x9B80801BU, 0x3DE2E2DFU,
|
|
0x26EBEBCDU, 0x6927274EU, 0xCDB2B27FU, 0x9F7575EAU,
|
|
0x1B090912U, 0x9E83831DU, 0x742C2C58U, 0x2E1A1A34U,
|
|
0x2D1B1B36U, 0xB26E6EDCU, 0xEE5A5AB4U, 0xFBA0A05BU,
|
|
0xF65252A4U, 0x4D3B3B76U, 0x61D6D6B7U, 0xCEB3B37DU,
|
|
0x7B292952U, 0x3EE3E3DDU, 0x712F2F5EU, 0x97848413U,
|
|
0xF55353A6U, 0x68D1D1B9U, 0x00000000U, 0x2CEDEDC1U,
|
|
0x60202040U, 0x1FFCFCE3U, 0xC8B1B179U, 0xED5B5BB6U,
|
|
0xBE6A6AD4U, 0x46CBCB8DU, 0xD9BEBE67U, 0x4B393972U,
|
|
0xDE4A4A94U, 0xD44C4C98U, 0xE85858B0U, 0x4ACFCF85U,
|
|
0x6BD0D0BBU, 0x2AEFEFC5U, 0xE5AAAA4FU, 0x16FBFBEDU,
|
|
0xC5434386U, 0xD74D4D9AU, 0x55333366U, 0x94858511U,
|
|
0xCF45458AU, 0x10F9F9E9U, 0x06020204U, 0x817F7FFEU,
|
|
0xF05050A0U, 0x443C3C78U, 0xBA9F9F25U, 0xE3A8A84BU,
|
|
0xF35151A2U, 0xFEA3A35DU, 0xC0404080U, 0x8A8F8F05U,
|
|
0xAD92923FU, 0xBC9D9D21U, 0x48383870U, 0x04F5F5F1U,
|
|
0xDFBCBC63U, 0xC1B6B677U, 0x75DADAAFU, 0x63212142U,
|
|
0x30101020U, 0x1AFFFFE5U, 0x0EF3F3FDU, 0x6DD2D2BFU,
|
|
0x4CCDCD81U, 0x140C0C18U, 0x35131326U, 0x2FECECC3U,
|
|
0xE15F5FBEU, 0xA2979735U, 0xCC444488U, 0x3917172EU,
|
|
0x57C4C493U, 0xF2A7A755U, 0x827E7EFCU, 0x473D3D7AU,
|
|
0xAC6464C8U, 0xE75D5DBAU, 0x2B191932U, 0x957373E6U,
|
|
0xA06060C0U, 0x98818119U, 0xD14F4F9EU, 0x7FDCDCA3U,
|
|
0x66222244U, 0x7E2A2A54U, 0xAB90903BU, 0x8388880BU,
|
|
0xCA46468CU, 0x29EEEEC7U, 0xD3B8B86BU, 0x3C141428U,
|
|
0x79DEDEA7U, 0xE25E5EBCU, 0x1D0B0B16U, 0x76DBDBADU,
|
|
0x3BE0E0DBU, 0x56323264U, 0x4E3A3A74U, 0x1E0A0A14U,
|
|
0xDB494992U, 0x0A06060CU, 0x6C242448U, 0xE45C5CB8U,
|
|
0x5DC2C29FU, 0x6ED3D3BDU, 0xEFACAC43U, 0xA66262C4U,
|
|
0xA8919139U, 0xA4959531U, 0x37E4E4D3U, 0x8B7979F2U,
|
|
0x32E7E7D5U, 0x43C8C88BU, 0x5937376EU, 0xB76D6DDAU,
|
|
0x8C8D8D01U, 0x64D5D5B1U, 0xD24E4E9CU, 0xE0A9A949U,
|
|
0xB46C6CD8U, 0xFA5656ACU, 0x07F4F4F3U, 0x25EAEACFU,
|
|
0xAF6565CAU, 0x8E7A7AF4U, 0xE9AEAE47U, 0x18080810U,
|
|
0xD5BABA6FU, 0x887878F0U, 0x6F25254AU, 0x722E2E5CU,
|
|
0x241C1C38U, 0xF1A6A657U, 0xC7B4B473U, 0x51C6C697U,
|
|
0x23E8E8CBU, 0x7CDDDDA1U, 0x9C7474E8U, 0x211F1F3EU,
|
|
0xDD4B4B96U, 0xDCBDBD61U, 0x868B8B0DU, 0x858A8A0FU,
|
|
0x907070E0U, 0x423E3E7CU, 0xC4B5B571U, 0xAA6666CCU,
|
|
0xD8484890U, 0x05030306U, 0x01F6F6F7U, 0x120E0E1CU,
|
|
0xA36161C2U, 0x5F35356AU, 0xF95757AEU, 0xD0B9B969U,
|
|
0x91868617U, 0x58C1C199U, 0x271D1D3AU, 0xB99E9E27U,
|
|
0x38E1E1D9U, 0x13F8F8EBU, 0xB398982BU, 0x33111122U,
|
|
0xBB6969D2U, 0x70D9D9A9U, 0x898E8E07U, 0xA7949433U,
|
|
0xB69B9B2DU, 0x221E1E3CU, 0x92878715U, 0x20E9E9C9U,
|
|
0x49CECE87U, 0xFF5555AAU, 0x78282850U, 0x7ADFDFA5U,
|
|
0x8F8C8C03U, 0xF8A1A159U, 0x80898909U, 0x170D0D1AU,
|
|
0xDABFBF65U, 0x31E6E6D7U, 0xC6424284U, 0xB86868D0U,
|
|
0xC3414182U, 0xB0999929U, 0x772D2D5AU, 0x110F0F1EU,
|
|
0xCBB0B07BU, 0xFC5454A8U, 0xD6BBBB6DU, 0x3A16162CU
|
|
};
|
|
|
|
// BYTE(word, idx): 8-bit field of `word` starting at bit 8 * idx, obtained
// through amd_bfe() (idx 0 selects the least significant byte).
#define BYTE(word, idx) (amd_bfe((word), (idx) << 3U, 8U))
|
|
|
|
#if (ALGO == ALGO_CN_HEAVY_TUBE)
|
|
inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 x, uint4 k)
|
|
{
|
|
x = ~x;
|
|
k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ rotate(AES0[BYTE(x.s2, 2)] ^ AES1[BYTE(x.s3, 3)], 16U);
|
|
x.s0 ^= k.s0;
|
|
k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ rotate(AES0[BYTE(x.s3, 2)] ^ AES1[BYTE(x.s0, 3)], 16U);
|
|
x.s1 ^= k.s1;
|
|
k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ rotate(AES0[BYTE(x.s0, 2)] ^ AES1[BYTE(x.s1, 3)], 16U);
|
|
x.s2 ^= k.s2;
|
|
k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ rotate(AES0[BYTE(x.s1, 2)] ^ AES1[BYTE(x.s2, 3)], 16U);
|
|
return k;
|
|
}
|
|
#endif
|
|
|
|
// Four-table round: for each output word, one byte is taken from each of the
// four words of X (byte 0 of word n, byte 1 of word n+1, byte 2 of word n+2,
// byte 3 of word n+3, indices modulo 4), looked up in AES0..AES3 respectively,
// and the four results are XORed into the matching word of `key`.
//
// AES0..AES3: lookup tables in __local memory, indexed by a byte value.
// X:          input state (not modified).
// key:        value the lookups are XORed into; passed by value.
// Returns key XOR the per-word lookup results.
uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{
    const uint w0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
    const uint w1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
    const uint w2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
    const uint w3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];

    return key ^ (uint4)(w0, w1, w2, w3);
}
|
|
|
|
// Two-table round: same byte selection as AES_Round, but the lookups for
// byte 2 and byte 3 reuse AES0 and AES1 and their XOR is rotated left by
// 16 bits, so only two tables are needed.
//
// AES0, AES1: lookup tables in __local memory, indexed by a byte value.
// X:          input state (not modified).
// key:        value the lookups are XORed into; passed by value.
// Returns key XOR the per-word lookup results.
uint4 AES_Round_Two_Tables(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key)
{
    // Contribution of bytes 0 and 1 of each output word (used as-is).
    const uint lo0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)];
    const uint lo1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)];
    const uint lo2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)];
    const uint lo3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)];

    // Contribution of bytes 2 and 3 (rotated by 16 bits before use).
    const uint hi0 = AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)];
    const uint hi1 = AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)];
    const uint hi2 = AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)];
    const uint hi3 = AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)];

    return key ^ (uint4)(lo0 ^ rotate(hi0, 16U),
                         lo1 ^ rotate(hi1, 16U),
                         lo2 ^ rotate(hi2, 16U),
                         lo3 ^ rotate(hi3, 16U));
}
|
|
|
|
|
|
// Round constants consumed by AESExpandKey256, which starts reading at
// index 1 and advances by one for every eighth generated key word.
STATIC const __constant uchar rcon[8] =
{
    0x8D, 0x01, 0x02, 0x04,
    0x08, 0x10, 0x20, 0x40
};
|
|
|
|
|
|
// AES forward S-box: 256-entry byte substitution table.
// Indexed by a single byte value; used by the SubWord() macro below to
// substitute each byte of a 32-bit word during key expansion.
STATIC const __constant uchar sbox[256] =
|
|
{
|
|
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
|
|
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
|
|
0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
|
|
0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
|
|
0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
|
|
0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
|
|
0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
|
|
0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
|
|
0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
|
|
0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
|
|
0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
|
|
0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
|
|
0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
|
|
0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
|
|
0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
|
|
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
|
|
};
|
|
|
|
|
|
// SubWord(inw): replace each of the four bytes of the 32-bit word `inw` with
// its sbox[] substitute, keeping every byte in its original position.
//
// Each table byte is widened to uint before shifting: without the cast the
// uchar is promoted to a signed int, so `<< 24` of a value >= 0x80 lands in
// the sign bit and the macro yields a signed result.
//
// `inw` is evaluated four times; do not pass an expression with side effects.
#define SubWord(inw) (((uint)sbox[BYTE(inw, 3)] << 24) | ((uint)sbox[BYTE(inw, 2)] << 16) | ((uint)sbox[BYTE(inw, 1)] << 8) | (uint)sbox[BYTE(inw, 0)])
|
|
|
|
|
|
// Expands a 256-bit AES key in place.
//
// keybuf: on entry, words 0..7 hold the key; on return, words 8..39 have been
//         filled in as well, so the buffer must hold at least 40 uints.
//
// Each new word is keybuf[w - 8] XOR a value derived from keybuf[w - 1]:
//   - w is a multiple of 8:  SubWord, rotate left by 24 bits, then XOR the
//                            next round constant into the low byte;
//   - w is 4 past a multiple of 8:  SubWord only;
//   - otherwise:             the previous word unchanged.
void AESExpandKey256(uint *keybuf)
{
    uint rconIdx = 1;

    for (uint w = 8; w < 40; ++w) {
        const uint pos = w & 7;
        uint prev = keybuf[w - 1];

        if (pos == 0) {
            // Start of an 8-word group: substitute, rotate, add round constant.
            prev = SubWord(prev);
            prev = rotate(prev, 24U) ^ as_uint((uchar4)(rcon[rconIdx], 0U, 0U, 0U));
            ++rconIdx;
        } else if (pos == 4) {
            // Middle of an 8-word group: substitution only.
            prev = SubWord(prev);
        }

        keybuf[w] = keybuf[w - 8] ^ prev;
    }
}
|
|
|
|
#endif
|