Optimized soft AES implementations

cn-pico: +6.7%
cn/half: +6.2%
cn/2: +4.3%
cn-heavy: +9.1%
cn/wow, cn/r: 2.4-2.6 times faster
SChernykh 2019-02-24 20:04:09 +01:00
parent a5dcd6dd1f
commit 488cec09dd
18 changed files with 2380 additions and 1090 deletions
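The speedup comes from replacing the per-byte soft AES round with a table-driven ("T-table") round: saes_table below is treated as four consecutive 256-entry uint32 tables, each a byte-rotation of the first, so SubBytes, ShiftRows and MixColumns collapse into four lookups and three XORs per output column, with AddRoundKey as a final XOR. A minimal sketch of how such a table can be derived from the AES S-box follows; this illustrates the assumed layout only and is not code from this commit (the S-box is passed in by the caller):

#include <cstdint>

// Multiplication by 2 in GF(2^8) with the AES reduction polynomial 0x11B.
static inline uint8_t xtime(uint8_t x)
{
    return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1B));
}

// Builds four 256-entry tables: t[i] = {2*S, S, S, 3*S} packed little-endian,
// followed by the same word rotated left by 8, 16 and 24 bits.
// soft_aesenc() in the diff below walks these with "t += 256" between lookups.
void build_t_tables(uint32_t t[4 * 256], const uint8_t sbox[256])
{
    for (int i = 0; i < 256; ++i) {
        const uint8_t s  = sbox[i];
        const uint8_t s2 = xtime(s);

        const uint32_t w = (uint32_t)s2
                         | ((uint32_t)s        << 8)
                         | ((uint32_t)s        << 16)
                         | ((uint32_t)(s2 ^ s) << 24);   // 3*S = 2*S ^ S

        t[i]       = w;
        t[256 + i] = (w << 8)  | (w >> 24);
        t[512 + i] = (w << 16) | (w >> 16);
        t[768 + i] = (w << 24) | (w >> 8);
    }
}

With this layout one output column is y = T0[b0] ^ T1[b1] ^ T2[b2] ^ T3[b3] ^ k, which is exactly the access pattern visible in the new soft_aesenc functions.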

@@ -192,31 +192,102 @@ static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, _
}
template<bool SOFT_AES>
static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
static FORCEINLINE void soft_aesenc(void* __restrict ptr, const void* __restrict key, const uint32_t* __restrict t)
{
    if (SOFT_AES) {
        *x0 = soft_aesenc((uint32_t*)x0, key);
        *x1 = soft_aesenc((uint32_t*)x1, key);
        *x2 = soft_aesenc((uint32_t*)x2, key);
        *x3 = soft_aesenc((uint32_t*)x3, key);
        *x4 = soft_aesenc((uint32_t*)x4, key);
        *x5 = soft_aesenc((uint32_t*)x5, key);
        *x6 = soft_aesenc((uint32_t*)x6, key);
        *x7 = soft_aesenc((uint32_t*)x7, key);
    }
    else {
        *x0 = _mm_aesenc_si128(*x0, key);
        *x1 = _mm_aesenc_si128(*x1, key);
        *x2 = _mm_aesenc_si128(*x2, key);
        *x3 = _mm_aesenc_si128(*x3, key);
        *x4 = _mm_aesenc_si128(*x4, key);
        *x5 = _mm_aesenc_si128(*x5, key);
        *x6 = _mm_aesenc_si128(*x6, key);
        *x7 = _mm_aesenc_si128(*x7, key);
    }
    uint32_t x0 = ((const uint32_t*)(ptr))[0];
    uint32_t x1 = ((const uint32_t*)(ptr))[1];
    uint32_t x2 = ((const uint32_t*)(ptr))[2];
    uint32_t x3 = ((const uint32_t*)(ptr))[3];
    uint32_t y0 = t[x0 & 0xff]; x0 >>= 8;
    uint32_t y1 = t[x1 & 0xff]; x1 >>= 8;
    uint32_t y2 = t[x2 & 0xff]; x2 >>= 8;
    uint32_t y3 = t[x3 & 0xff]; x3 >>= 8;
    t += 256;
    y0 ^= t[x1 & 0xff]; x1 >>= 8;
    y1 ^= t[x2 & 0xff]; x2 >>= 8;
    y2 ^= t[x3 & 0xff]; x3 >>= 8;
    y3 ^= t[x0 & 0xff]; x0 >>= 8;
    t += 256;
    y0 ^= t[x2 & 0xff]; x2 >>= 8;
    y1 ^= t[x3 & 0xff]; x3 >>= 8;
    y2 ^= t[x0 & 0xff]; x0 >>= 8;
    y3 ^= t[x1 & 0xff]; x1 >>= 8;
    t += 256;
    y0 ^= t[x3];
    y1 ^= t[x0];
    y2 ^= t[x1];
    y3 ^= t[x2];
    ((uint32_t*)ptr)[0] = y0 ^ ((uint32_t*)key)[0];
    ((uint32_t*)ptr)[1] = y1 ^ ((uint32_t*)key)[1];
    ((uint32_t*)ptr)[2] = y2 ^ ((uint32_t*)key)[2];
    ((uint32_t*)ptr)[3] = y3 ^ ((uint32_t*)key)[3];
}
static FORCEINLINE __m128i soft_aesenc(const void* __restrict ptr, const __m128i key, const uint32_t* __restrict t)
{
    uint32_t x0 = ((const uint32_t*)(ptr))[0];
    uint32_t x1 = ((const uint32_t*)(ptr))[1];
    uint32_t x2 = ((const uint32_t*)(ptr))[2];
    uint32_t x3 = ((const uint32_t*)(ptr))[3];
    uint32_t y0 = t[x0 & 0xff]; x0 >>= 8;
    uint32_t y1 = t[x1 & 0xff]; x1 >>= 8;
    uint32_t y2 = t[x2 & 0xff]; x2 >>= 8;
    uint32_t y3 = t[x3 & 0xff]; x3 >>= 8;
    t += 256;
    y0 ^= t[x1 & 0xff]; x1 >>= 8;
    y1 ^= t[x2 & 0xff]; x2 >>= 8;
    y2 ^= t[x3 & 0xff]; x3 >>= 8;
    y3 ^= t[x0 & 0xff]; x0 >>= 8;
    t += 256;
    y0 ^= t[x2 & 0xff]; x2 >>= 8;
    y1 ^= t[x3 & 0xff]; x3 >>= 8;
    y2 ^= t[x0 & 0xff]; x0 >>= 8;
    y3 ^= t[x1 & 0xff]; x1 >>= 8;
    y0 ^= t[x3 + 256];
    y1 ^= t[x0 + 256];
    y2 ^= t[x1 + 256];
    y3 ^= t[x2 + 256];
    return _mm_xor_si128(_mm_set_epi32(y3, y2, y1, y0), key);
}
template<bool SOFT_AES>
void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7);
template<>
static NOINLINE void aes_round<true>(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{
    *x0 = soft_aesenc((uint32_t*)x0, key, (const uint32_t*)saes_table);
    *x1 = soft_aesenc((uint32_t*)x1, key, (const uint32_t*)saes_table);
    *x2 = soft_aesenc((uint32_t*)x2, key, (const uint32_t*)saes_table);
    *x3 = soft_aesenc((uint32_t*)x3, key, (const uint32_t*)saes_table);
    *x4 = soft_aesenc((uint32_t*)x4, key, (const uint32_t*)saes_table);
    *x5 = soft_aesenc((uint32_t*)x5, key, (const uint32_t*)saes_table);
    *x6 = soft_aesenc((uint32_t*)x6, key, (const uint32_t*)saes_table);
    *x7 = soft_aesenc((uint32_t*)x7, key, (const uint32_t*)saes_table);
}
template<>
static FORCEINLINE void aes_round<false>(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{
    *x0 = _mm_aesenc_si128(*x0, key);
    *x1 = _mm_aesenc_si128(*x1, key);
    *x2 = _mm_aesenc_si128(*x2, key);
    *x3 = _mm_aesenc_si128(*x3, key);
    *x4 = _mm_aesenc_si128(*x4, key);
    *x5 = _mm_aesenc_si128(*x5, key);
    *x6 = _mm_aesenc_si128(*x6, key);
    *x7 = _mm_aesenc_si128(*x7, key);
}
inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
{
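Note the inlining split in the new specializations above: the table-based aes_round<true> is NOINLINE, so the explode/implode loops share one out-of-line copy of the eight-lookup round instead of duplicating it at every call site, while the AES-NI aes_round<false> stays FORCEINLINE; this looks like a deliberate code-size/I-cache trade-off. As a quick sanity check (not part of the commit), the soft round can be compared against the hardware round on an AES-NI machine. A minimal sketch, assuming the header changed above is included and the file is built with -maes:

#include <wmmintrin.h>   // _mm_aesenc_si128
#include <cstdio>
#include <cstring>

// Returns true when the table-based and hardware AES rounds agree for one block/key pair.
static bool rounds_match(__m128i block, __m128i key)
{
    const __m128i hw = _mm_aesenc_si128(block, key);
    const __m128i sw = soft_aesenc(&block, key, (const uint32_t*)saes_table);
    return std::memcmp(&hw, &sw, sizeof(__m128i)) == 0;
}

int main()
{
    const __m128i block = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
    const __m128i key   = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
    std::printf("%s\n", rounds_match(block, key) ? "soft and hardware rounds match" : "MISMATCH");
    return 0;
}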
@@ -478,6 +549,8 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l,
    }
}
void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM);
template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
@@ -498,9 +571,31 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
    cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
    const uint8_t* l0 = ctx[0]->memory;
    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
#ifndef XMRIG_NO_ASM
    if (SOFT_AES && xmrig::cn_is_cryptonight_r<VARIANT>())
    {
        if (!ctx[0]->generated_code_data.match(VARIANT, height)) {
            V4_Instruction code[256];
            const int code_size = v4_random_math_init<VARIANT>(code, height);
            if (VARIANT == xmrig::VARIANT_WOW)
                wow_soft_aes_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), xmrig::ASM_NONE);
            else if (VARIANT == xmrig::VARIANT_4)
                v4_soft_aes_compile_code(code, code_size, reinterpret_cast<void*>(ctx[0]->generated_code), xmrig::ASM_NONE);
            ctx[0]->generated_code_data.variant = VARIANT;
            ctx[0]->generated_code_data.height = height;
        }
        ctx[0]->saes_table = (const uint32_t*)saes_table;
        ctx[0]->generated_code(ctx[0]);
    } else {
#endif
    const uint8_t* l0 = ctx[0]->memory;
    VARIANT1_INIT(0);
    VARIANT2_INIT(0);
    VARIANT2_SET_ROUNDING_MODE();
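For cn/wow and cn/r the large speedup comes from this path: the per-height random-math program is compiled once per (variant, height) pair and then executed as machine code through ctx[0]->generated_code, rather than being evaluated step by step inside the main loop. The cache key is generated_code_data; its definition is not in this hunk, but a hypothetical sketch of what the match() check implies could look like the following (field and method names mirror the calls above; the types are assumptions):

#include <cstdint>

struct GeneratedCodeData {
    int      variant = -1;          // xmrig::Variant in the real code (assumed)
    uint64_t height  = UINT64_MAX;  // block height the program was generated for

    bool match(const int v, const uint64_t h) const
    {
        return variant == v && height == h;
    }
};

A fresh context therefore compiles on its first hash and recompiles only when the variant or block height changes.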
@@ -524,7 +619,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
            cx = aes_round_tweak_div(cx, ax0);
        }
        else if (SOFT_AES) {
            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0, (const uint32_t*)saes_table);
        }
        else {
            cx = _mm_aesenc_si128(cx, ax0);
@@ -602,6 +697,10 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
        bx0 = cx;
    }
#ifndef XMRIG_NO_ASM
    }
#endif
    cn_implode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
    xmrig::keccakf(h0, 24);
@@ -857,8 +956,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
            cx1 = aes_round_tweak_div(cx1, ax1);
        }
        else if (SOFT_AES) {
            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0);
            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1);
            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0, (const uint32_t*)saes_table);
            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1, (const uint32_t*)saes_table);
        }
        else {
            cx0 = _mm_aesenc_si128(cx0, ax0);
@@ -1019,7 +1118,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
        c = aes_round_tweak_div(c, a); \
    } \
    else if (SOFT_AES) { \
        c = soft_aesenc(c, a); \
        c = soft_aesenc(&c, a, (const uint32_t*)saes_table); \
    } else { \
        c = _mm_aesenc_si128(c, a); \
    } \