From 9bceb65ad87e64e6e88599e0d6988cbea51c5555 Mon Sep 17 00:00:00 2001
From: Foudge <olivier.fouge@free.fr>
Date: Sat, 20 Jan 2018 10:43:56 +0100
Subject: [PATCH 1/5] +15% boost with non-AES CPU

The performance boost was validated on a Core 2 Quad processor under Windows 10.
However, this change is specific to Windows/MS Visual C++.
---
 src/crypto/soft_aes.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/src/crypto/soft_aes.h b/src/crypto/soft_aes.h
index 99321c4e..148f39c1 100644
--- a/src/crypto/soft_aes.h
+++ b/src/crypto/soft_aes.h
@@ -91,10 +91,17 @@ alignas(16) const uint8_t  saes_sbox[256] = saes_data(saes_h0);
 
 static inline __m128i soft_aesenc(__m128i in, __m128i key)
 {
-    const uint32_t x0 = _mm_cvtsi128_si32(in);
-    const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-    const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
-    const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
+#if defined(_MSC_VER)
+	const uint32_t x0 = in.m128i_u32[0];
+	const uint32_t x1 = in.m128i_u32[1];
+	const uint32_t x2 = in.m128i_u32[2];
+	const uint32_t x3 = in.m128i_u32[3];
+#else
+	const uint32_t x0 = _mm_cvtsi128_si32(in);
+	const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
+	const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
+	const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
+#endif
 
 	__m128i out = _mm_set_epi32(
 		(saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),

From 15fe6ce23f4aa2654bb8b458463e6213ee867486 Mon Sep 17 00:00:00 2001
From: Foudge <olivier.fouge@free.fr>
Date: Sat, 27 Jan 2018 11:42:22 +0100
Subject: [PATCH 2/5] Remove compilation warnings under MSVC

---
 src/api/NetworkState.cpp | 4 ++--
 src/net/Client.cpp       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/api/NetworkState.cpp b/src/api/NetworkState.cpp
index bae290d0..d3ffddd3 100644
--- a/src/api/NetworkState.cpp
+++ b/src/api/NetworkState.cpp
@@ -46,7 +46,7 @@ NetworkState::NetworkState() :
 
 int NetworkState::connectionTime() const
 {
-    return m_active ? ((uv_now(uv_default_loop()) - m_connectionTime) / 1000) : 0;
+    return m_active ? (int)((uv_now(uv_default_loop()) - m_connectionTime) / 1000) : 0;
 }
 
 
@@ -56,7 +56,7 @@ uint32_t NetworkState::avgTime() const
         return 0;
     }
 
-    return (uint32_t) connectionTime() / m_latency.size();
+    return connectionTime() / (uint32_t)m_latency.size();
 }
 
 
diff --git a/src/net/Client.cpp b/src/net/Client.cpp
index 8e8ebcfd..fcaec8eb 100644
--- a/src/net/Client.cpp
+++ b/src/net/Client.cpp
@@ -532,7 +532,7 @@ void Client::onAllocBuffer(uv_handle_t *handle, size_t suggested_size, uv_buf_t
     auto client = getClient(handle->data);
 
     buf->base = &client->m_recvBuf.base[client->m_recvBufPos];
-    buf->len  = client->m_recvBuf.len - client->m_recvBufPos;
+    buf->len  = client->m_recvBuf.len - (ULONG)client->m_recvBufPos;
 }
 
 

From 9a28ad590ca6137bf5e19ba477e3f379527bbd73 Mon Sep 17 00:00:00 2001
From: Foudge <olivier.fouge@free.fr>
Date: Sun, 28 Jan 2018 12:58:19 +0100
Subject: [PATCH 3/5] up to 20% perf increase with Cryptonight with non-AES CPU

This time, the performance increase is obtained with both MSVC and GCC. On non-AES CPUs, there was a useless load/store of an SSE2 register. The previous MSVC "hack" is replaced by portable code that is also more complete (a load is saved).

On my C2Q6600, with 3 threads, I get +16% with MSVC 2015 and +20% with GCC 7.3, compared to the official 2.4.4 version.
---
 src/crypto/CryptoNight_arm.h | 30 +++++++++--------
 src/crypto/CryptoNight_x86.h | 62 ++++++++++++++++++------------------
 src/crypto/soft_aes.h        | 17 +++-------
 3 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 15be6c3d..17bba7af 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -194,14 +194,14 @@ template<bool SOFT_AES>
 static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
 {
     if (SOFT_AES) {
-        *x0 = soft_aesenc(*x0, key);
-        *x1 = soft_aesenc(*x1, key);
-        *x2 = soft_aesenc(*x2, key);
-        *x3 = soft_aesenc(*x3, key);
-        *x4 = soft_aesenc(*x4, key);
-        *x5 = soft_aesenc(*x5, key);
-        *x6 = soft_aesenc(*x6, key);
-        *x7 = soft_aesenc(*x7, key);
+        *x0 = soft_aesenc((uint32_t*)x0, key);
+        *x1 = soft_aesenc((uint32_t*)x1, key);
+        *x2 = soft_aesenc((uint32_t*)x2, key);
+        *x3 = soft_aesenc((uint32_t*)x3, key);
+        *x4 = soft_aesenc((uint32_t*)x4, key);
+        *x5 = soft_aesenc((uint32_t*)x5, key);
+        *x6 = soft_aesenc((uint32_t*)x6, key);
+        *x7 = soft_aesenc((uint32_t*)x7, key);
     }
 #   ifndef XMRIG_ARMv7
     else {
@@ -361,12 +361,13 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void *
     uint64_t idx0 = h0[0] ^ h0[4];
 
     for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+		__m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
         }
         else {
+			cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
 #           ifndef XMRIG_ARMv7
             cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
 #           endif
@@ -425,14 +426,15 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size,
     uint64_t idx1 = h1[0] ^ h1[4];
 
     for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
+        __m128i cx0, cx1;
 
         if (SOFT_AES) {
-            cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
-            cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
         }
         else {
+			cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+			cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
 #           ifndef XMRIG_ARMv7
             cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
             cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 362a1a9f..786d28f1 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -193,14 +193,14 @@ template<bool SOFT_AES>
 static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
 {
     if (SOFT_AES) {
-        *x0 = soft_aesenc(*x0, key);
-        *x1 = soft_aesenc(*x1, key);
-        *x2 = soft_aesenc(*x2, key);
-        *x3 = soft_aesenc(*x3, key);
-        *x4 = soft_aesenc(*x4, key);
-        *x5 = soft_aesenc(*x5, key);
-        *x6 = soft_aesenc(*x6, key);
-        *x7 = soft_aesenc(*x7, key);
+        *x0 = soft_aesenc((uint32_t*)x0, key);
+        *x1 = soft_aesenc((uint32_t*)x1, key);
+        *x2 = soft_aesenc((uint32_t*)x2, key);
+        *x3 = soft_aesenc((uint32_t*)x3, key);
+        *x4 = soft_aesenc((uint32_t*)x4, key);
+        *x5 = soft_aesenc((uint32_t*)x5, key);
+        *x6 = soft_aesenc((uint32_t*)x6, key);
+        *x7 = soft_aesenc((uint32_t*)x7, key);
     }
     else {
         *x0 = _mm_aesenc_si128(*x0, key);
@@ -324,19 +324,18 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void *
     uint64_t idx0 = h0[0] ^ h0[4];
 
     for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx;
-        cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+		__m128i cx;
 
-        if (SOFT_AES) {
-            cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-        }
-        else {
-            cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-        }
-
-        _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
+		if (SOFT_AES) {
+			cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+		}
+		else {	
+			cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+			cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+		}
+		_mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+		idx0 = EXTRACT64(cx);
+		bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
         cl = ((uint64_t*) &l0[idx0 & MASK])[0];
@@ -386,18 +385,19 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size,
     uint64_t idx0 = h0[0] ^ h0[4];
     uint64_t idx1 = h1[0] ^ h1[4];
 
-    for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
+    for (size_t i = 0; i < ITERATIONS; i++)	{
+		__m128i cx0, cx1;
 
-        if (SOFT_AES) {
-            cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
-            cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
-        }
-        else {
-            cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
-            cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
-        }
+		if (SOFT_AES) {
+			cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+			cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+		}
+		else {
+			cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+			cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
+			cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+			cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+		}
 
         _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
         _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
diff --git a/src/crypto/soft_aes.h b/src/crypto/soft_aes.h
index 148f39c1..b7698ac4 100644
--- a/src/crypto/soft_aes.h
+++ b/src/crypto/soft_aes.h
@@ -89,19 +89,12 @@
 alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
 alignas(16) const uint8_t  saes_sbox[256] = saes_data(saes_h0);
 
-static inline __m128i soft_aesenc(__m128i in, __m128i key)
+static inline __m128i soft_aesenc(const uint32_t* in, __m128i key)
 {
-#if defined(_MSC_VER)
-	const uint32_t x0 = in.m128i_u32[0];
-	const uint32_t x1 = in.m128i_u32[1];
-	const uint32_t x2 = in.m128i_u32[2];
-	const uint32_t x3 = in.m128i_u32[3];
-#else
-	const uint32_t x0 = _mm_cvtsi128_si32(in);
-	const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-	const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
-	const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
-#endif
+	const uint32_t x0 = in[0];
+	const uint32_t x1 = in[1];
+	const uint32_t x2 = in[2];
+	const uint32_t x3 = in[3];
 
 	__m128i out = _mm_set_epi32(
 		(saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),

From d2964576c7da7784d060116a4970f60f6b895f10 Mon Sep 17 00:00:00 2001
From: Foudge <olivier.fouge@free.fr>
Date: Sun, 28 Jan 2018 18:13:00 +0100
Subject: [PATCH 4/5] Compilation error under FreeBSD

ULONG is not recognized under this OS, so it is replaced with a more portable type (unsigned long).
---
 src/net/Client.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/net/Client.cpp b/src/net/Client.cpp
index fcaec8eb..fb83acd2 100644
--- a/src/net/Client.cpp
+++ b/src/net/Client.cpp
@@ -532,7 +532,7 @@ void Client::onAllocBuffer(uv_handle_t *handle, size_t suggested_size, uv_buf_t
     auto client = getClient(handle->data);
 
     buf->base = &client->m_recvBuf.base[client->m_recvBufPos];
-    buf->len  = client->m_recvBuf.len - (ULONG)client->m_recvBufPos;
+    buf->len  = client->m_recvBuf.len - (unsigned long)client->m_recvBufPos;
 }
 
 

From 037abd703720cc52f228c99d1e0e205ce48abd85 Mon Sep 17 00:00:00 2001
From: Foudge <olivier.fouge@free.fr>
Date: Sat, 3 Feb 2018 16:03:14 +0100
Subject: [PATCH 5/5] Correct L2 cache size calculation for Intel Core 2 family

This is a workaround for the total L2 cache size calculation on Intel Core Solo, Core Duo, Core 2 Duo, Core 2 Quad and their Xeon counterparts. On these processors, each L2 cache is shared by 2 cores.

There may be more CPUs with a shared L2 cache, but I am sure that these models are affected, and they are not very numerous.
A better solution would be to modify libcpuid to implement L2 cache counting.
---
 src/Cpu.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/Cpu.cpp b/src/Cpu.cpp
index ff6f49e9..b122e156 100644
--- a/src/Cpu.cpp
+++ b/src/Cpu.cpp
@@ -100,7 +100,13 @@ void Cpu::initCommon()
         m_l2_cache = data.l2_cache * (m_totalCores / 2) * m_sockets;
         m_l2_exclusive = true;
     }
-    else {
+	// Workaround for Intel Core Solo, Core Duo, Core 2 Duo, Core 2 Quad and their Xeon homologue
+	// These processors have L2 cache shared by 2 cores.
+	else if (data.vendor == VENDOR_INTEL && data.family == 0x06 && (data.model == 0x0E || data.model == 0x0F || data.model == 0x07)) {
+		int l2_count_per_socket = m_totalCores > 1 ? m_totalCores / 2 : 1;
+		m_l2_cache = data.l2_cache > 0 ? data.l2_cache * l2_count_per_socket * m_sockets : 0;
+	}
+	else{
         m_l2_cache = data.l2_cache > 0 ? data.l2_cache * m_totalCores * m_sockets : 0;
     }