Optimized cn-heavy for Zen3

- Uses scratchpad interleaving to access only the closest L3 slice from each CPU core.
- Also activates MSR mod for cn-heavy because CPU prefetchers get confused by the interleaving
- 7-8% speedup on Zen3
This commit is contained in:
SChernykh 2021-02-07 22:05:11 +01:00
parent b1e14dc1d3
commit 8af8df25aa
8 changed files with 187 additions and 81 deletions

View file

@ -81,6 +81,7 @@ public:
inline void start(const std::vector<CpuLaunchData> &threads, size_t memory)
{
m_workersMemory.clear();
m_hugePages.reset();
m_memory = memory;
m_started = 0;
@ -95,8 +96,10 @@ public:
if (ready) {
m_started++;
m_hugePages += worker->memory()->hugePages();
m_ways += worker->intensity();
if (m_workersMemory.insert(worker->memory()).second) {
m_hugePages += worker->memory()->hugePages();
}
m_ways += worker->intensity();
}
else {
m_errors++;
@ -126,6 +129,7 @@ public:
}
private:
std::set<const VirtualMemory*> m_workersMemory;
HugePagesInfo m_hugePages;
size_t m_errors = 0;
size_t m_memory = 0;

View file

@ -103,7 +103,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const
size_t xmrig::CpuConfig::memPoolSize() const
{
return m_memoryPool < 0 ? Cpu::info()->threads() : m_memoryPool;
return m_memoryPool < 0 ? std::max(Cpu::info()->threads(), Cpu::info()->L3() >> 21) : m_memoryPool;
}

View file

@ -19,8 +19,10 @@
#include <cassert>
#include <thread>
#include <mutex>
#include "backend/cpu/Cpu.h"
#include "backend/cpu/CpuWorker.h"
#include "base/tools/Chrono.h"
#include "core/config/Config.h"
@ -55,6 +57,12 @@ namespace xmrig {
static constexpr uint32_t kReserveCount = 32768;
#ifdef XMRIG_ALGO_CN_HEAVY
static std::mutex cn_heavyZen3MemoryMutex;
VirtualMemory* cn_heavyZen3Memory = nullptr;
#endif
} // namespace xmrig
@ -73,7 +81,20 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
m_threads(data.threads),
m_ctx()
{
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
# ifdef XMRIG_ALGO_CN_HEAVY
// cn-heavy optimization for Zen3 CPUs
if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
std::lock_guard<std::mutex> lock(cn_heavyZen3MemoryMutex);
if (!cn_heavyZen3Memory) {
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * m_threads, data.hugePages, false, false, node());
}
m_memory = cn_heavyZen3Memory;
}
else
# endif
{
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
}
}
@ -85,7 +106,13 @@ xmrig::CpuWorker<N>::~CpuWorker()
# endif
CnCtx::release(m_ctx, N);
delete m_memory;
# ifdef XMRIG_ALGO_CN_HEAVY
if (m_memory != cn_heavyZen3Memory)
# endif
{
delete m_memory;
}
}
@ -387,7 +414,16 @@ template<size_t N>
void xmrig::CpuWorker<N>::allocateCnCtx()
{
if (m_ctx[0] == nullptr) {
CnCtx::create(m_ctx, m_memory->scratchpad(), m_algorithm.l3(), N);
int shift = 0;
# ifdef XMRIG_ALGO_CN_HEAVY
// cn-heavy optimization for Zen3 CPUs
if (m_memory == cn_heavyZen3Memory) {
shift = (id() / 8) * m_algorithm.l3() * 8 + (id() % 8) * 64;
}
# endif
CnCtx::create(m_ctx, m_memory->scratchpad() + shift, m_algorithm.l3(), N);
}
}

View file

@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
return;
}
std::vector<std::pair<int64_t, int32_t>> threads_data;
threads_data.reserve(cores.size());
size_t pu_id = 0;
while (cacheHashes > 0 && PUs > 0) {
bool allocated_pu = false;
threads_data.clear();
for (hwloc_obj_t core : cores) {
const std::vector<hwloc_obj_t> units = findByType(core, HWLOC_OBJ_PU);
if (units.size() <= pu_id) {
@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
PUs--;
allocated_pu = true;
threads.add(units[pu_id]->os_index, intensity);
threads_data.emplace_back(units[pu_id]->os_index, intensity);
if (cacheHashes == 0) {
break;
}
}
// Reversing of "threads_data" and "cores" is done to fill in virtual cores starting from the last one, but still in order
// For example, cn-heavy threads on 6-core Zen2/Zen3 will have affinity [0,2,4,6,8,10,9,11]
// This is important for Zen3 cn-heavy optimization
if (pu_id & 1) {
std::reverse(threads_data.begin(), threads_data.end());
}
for (const auto& t : threads_data) {
threads.add(t.first, t.second);
}
if (!allocated_pu) {
break;
}
pu_id++;
std::reverse(cores.begin(), cores.end());
}
# endif
}