Optimized cn-heavy for Zen3

- Uses scratchpad interleaving to access only the closest L3 slice from each CPU core.
- Also activates MSR mod for cn-heavy because CPU prefetchers get confused by the interleaving
- 7-8% speedup on Zen3
This commit is contained in:
SChernykh 2021-02-07 22:05:11 +01:00
parent b1e14dc1d3
commit 8af8df25aa
8 changed files with 187 additions and 81 deletions

View file

@ -81,6 +81,7 @@ public:
inline void start(const std::vector<CpuLaunchData> &threads, size_t memory)
{
m_workersMemory.clear();
m_hugePages.reset();
m_memory = memory;
m_started = 0;
@ -95,8 +96,10 @@ public:
if (ready) {
m_started++;
m_hugePages += worker->memory()->hugePages();
m_ways += worker->intensity();
if (m_workersMemory.insert(worker->memory()).second) {
m_hugePages += worker->memory()->hugePages();
}
m_ways += worker->intensity();
}
else {
m_errors++;
@ -126,6 +129,7 @@ public:
}
private:
std::set<const VirtualMemory*> m_workersMemory;
HugePagesInfo m_hugePages;
size_t m_errors = 0;
size_t m_memory = 0;

View file

@ -103,7 +103,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const
size_t xmrig::CpuConfig::memPoolSize() const
{
return m_memoryPool < 0 ? Cpu::info()->threads() : m_memoryPool;
return m_memoryPool < 0 ? std::max(Cpu::info()->threads(), Cpu::info()->L3() >> 21) : m_memoryPool;
}

View file

@ -19,8 +19,10 @@
#include <cassert>
#include <thread>
#include <mutex>
#include "backend/cpu/Cpu.h"
#include "backend/cpu/CpuWorker.h"
#include "base/tools/Chrono.h"
#include "core/config/Config.h"
@ -55,6 +57,12 @@ namespace xmrig {
static constexpr uint32_t kReserveCount = 32768;
#ifdef XMRIG_ALGO_CN_HEAVY
static std::mutex cn_heavyZen3MemoryMutex;
VirtualMemory* cn_heavyZen3Memory = nullptr;
#endif
} // namespace xmrig
@ -73,7 +81,20 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
m_threads(data.threads),
m_ctx()
{
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
# ifdef XMRIG_ALGO_CN_HEAVY
// cn-heavy optimization for Zen3 CPUs
if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
std::lock_guard<std::mutex> lock(cn_heavyZen3MemoryMutex);
if (!cn_heavyZen3Memory) {
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * m_threads, data.hugePages, false, false, node());
}
m_memory = cn_heavyZen3Memory;
}
else
# endif
{
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
}
}
@ -85,7 +106,13 @@ xmrig::CpuWorker<N>::~CpuWorker()
# endif
CnCtx::release(m_ctx, N);
delete m_memory;
# ifdef XMRIG_ALGO_CN_HEAVY
if (m_memory != cn_heavyZen3Memory)
# endif
{
delete m_memory;
}
}
@ -387,7 +414,16 @@ template<size_t N>
void xmrig::CpuWorker<N>::allocateCnCtx()
{
if (m_ctx[0] == nullptr) {
CnCtx::create(m_ctx, m_memory->scratchpad(), m_algorithm.l3(), N);
int shift = 0;
# ifdef XMRIG_ALGO_CN_HEAVY
// cn-heavy optimization for Zen3 CPUs
if (m_memory == cn_heavyZen3Memory) {
shift = (id() / 8) * m_algorithm.l3() * 8 + (id() % 8) * 64;
}
# endif
CnCtx::create(m_ctx, m_memory->scratchpad() + shift, m_algorithm.l3(), N);
}
}

View file

@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
return;
}
std::vector<std::pair<int64_t, int32_t>> threads_data;
threads_data.reserve(cores.size());
size_t pu_id = 0;
while (cacheHashes > 0 && PUs > 0) {
bool allocated_pu = false;
threads_data.clear();
for (hwloc_obj_t core : cores) {
const std::vector<hwloc_obj_t> units = findByType(core, HWLOC_OBJ_PU);
if (units.size() <= pu_id) {
@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
PUs--;
allocated_pu = true;
threads.add(units[pu_id]->os_index, intensity);
threads_data.emplace_back(units[pu_id]->os_index, intensity);
if (cacheHashes == 0) {
break;
}
}
// Reversing of "threads_data" and "cores" is done to fill in virtual cores starting from the last one, but still in order
// For example, cn-heavy threads on 6-core Zen2/Zen3 will have affinity [0,2,4,6,8,10,9,11]
// This is important for Zen3 cn-heavy optimization
if (pu_id & 1) {
std::reverse(threads_data.begin(), threads_data.end());
}
for (const auto& t : threads_data) {
threads.add(t.first, t.second);
}
if (!allocated_pu) {
break;
}
pu_id++;
std::reverse(cores.begin(), cores.end());
}
# endif
}