Optimized cn-heavy for Zen3
- Uses scratchpad interleaving so that each CPU core accesses only the closest L3 slice (see the sketch after the commit metadata below).
- Also activates the MSR mod for cn-heavy, because the CPU's hardware prefetchers get confused by the interleaved access pattern.
- 7-8% speedup on Zen3.
This commit is contained in:
parent b1e14dc1d3
commit 8af8df25aa

8 changed files with 187 additions and 81 deletions
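Not part of the commit, just orientation: a minimal sketch of what "scratchpad interleaving" means here, assuming the 64-byte cache-line granularity and the groups of 8 workers used in the diff below. All names and sizes are illustrative.

    #include <cstdint>
    #include <cstdio>

    int main() {
        constexpr uint64_t kLine    = 64; // cache-line size in bytes
        constexpr int      kWorkers = 8;  // cores sharing one Zen3 L3

        // Under interleaving, worker t owns cache lines t, t + 8, t + 16, ...
        // of the shared block: consecutive lines belong to different workers,
        // so each core's lines can sit in the L3 slice closest to it.
        for (int t = 0; t < kWorkers; ++t) {
            std::printf("worker %d owns byte offsets:", t);
            for (int k = 0; k < 3; ++k) {
                std::printf(" %llu", (unsigned long long)((uint64_t)(k * kWorkers + t) * kLine));
            }
            std::printf(" ...\n");
        }
        return 0;
    }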
@@ -81,6 +81,7 @@ public:
 
     inline void start(const std::vector<CpuLaunchData> &threads, size_t memory)
     {
+        m_workersMemory.clear();
         m_hugePages.reset();
         m_memory = memory;
         m_started = 0;
@@ -95,8 +96,10 @@ public:
         if (ready) {
             m_started++;
 
-            m_hugePages += worker->memory()->hugePages();
-            m_ways += worker->intensity();
+            if (m_workersMemory.insert(worker->memory()).second) {
+                m_hugePages += worker->memory()->hugePages();
+            }
+            m_ways += worker->intensity();
         }
         else {
             m_errors++;
@@ -126,6 +129,7 @@ public:
     }
 
 private:
+    std::set<const VirtualMemory*> m_workersMemory;
     HugePagesInfo m_hugePages;
     size_t m_errors = 0;
     size_t m_memory = 0;
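Side note (not part of the commit): m_workersMemory exists because several workers can now share one VirtualMemory allocation, so its huge pages must be counted once, not once per worker. std::set::insert returns a pair whose bool member is true only for the first insertion. A self-contained sketch, with a hypothetical Allocation type standing in for VirtualMemory:

    #include <cstdio>
    #include <set>

    struct Allocation { int pages; };

    int main() {
        Allocation shared{16};
        const Allocation* workers[3] = { &shared, &shared, &shared };

        std::set<const Allocation*> seen;
        int total = 0;
        for (const Allocation* a : workers) {
            if (seen.insert(a).second) { // true only for the first worker holding it
                total += a->pages;
            }
        }
        std::printf("pages counted once: %d\n", total); // 16, not 48
        return 0;
    }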
@@ -103,7 +103,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const
 
 size_t xmrig::CpuConfig::memPoolSize() const
 {
-    return m_memoryPool < 0 ? Cpu::info()->threads() : m_memoryPool;
+    return m_memoryPool < 0 ? std::max(Cpu::info()->threads(), Cpu::info()->L3() >> 21) : m_memoryPool;
 }
 
 
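Why ">> 21": shifting the L3 size in bytes right by 21 divides it by 2 MiB (1 << 21 bytes), converting the cache size into a count of 2 MiB huge pages, so the memory pool is now large enough to back the whole L3 as well as one entry per thread. A sketch of the arithmetic with hypothetical values (64 MiB L3, 16 threads):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t l3Bytes = 64ull << 20; // assume a 64 MiB L3
        const uint64_t threads = 16;          // assume 16 hardware threads

        // enough pool entries for all threads or the whole L3, whichever is larger
        const uint64_t poolSize = std::max(threads, l3Bytes >> 21);
        std::printf("pool size: %llu\n", (unsigned long long)poolSize); // 32
        return 0;
    }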
@@ -19,8 +19,10 @@
 
 #include <cassert>
 #include <thread>
+#include <mutex>
 
 
+#include "backend/cpu/Cpu.h"
 #include "backend/cpu/CpuWorker.h"
 #include "base/tools/Chrono.h"
 #include "core/config/Config.h"
@@ -55,6 +57,12 @@ namespace xmrig {
 
 static constexpr uint32_t kReserveCount = 32768;
 
+
+#ifdef XMRIG_ALGO_CN_HEAVY
+static std::mutex cn_heavyZen3MemoryMutex;
+VirtualMemory* cn_heavyZen3Memory = nullptr;
+#endif
+
 } // namespace xmrig
 
 
@@ -73,7 +81,20 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
     m_threads(data.threads),
     m_ctx()
 {
-    m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
+#   ifdef XMRIG_ALGO_CN_HEAVY
+    // cn-heavy optimization for Zen3 CPUs
+    if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
+        std::lock_guard<std::mutex> lock(cn_heavyZen3MemoryMutex);
+        if (!cn_heavyZen3Memory) {
+            cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * m_threads, data.hugePages, false, false, node());
+        }
+        m_memory = cn_heavyZen3Memory;
+    }
+    else
+#   endif
+    {
+        m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
+    }
 }
 
 
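The constructor above lazily creates a single shared allocation sized for all m_threads workers, behind a mutex: only the first qualifying cn-heavy worker on a Zen3 CPU allocates; the rest reuse the pointer. A stripped-down sketch of the pattern, with std::vector<char> standing in for VirtualMemory (all names hypothetical):

    #include <cstddef>
    #include <mutex>
    #include <vector>

    static std::mutex g_mutex;
    static std::vector<char>* g_shared = nullptr;

    // First caller allocates one block for all threads; later callers share it.
    std::vector<char>* sharedScratchpad(size_t perThread, size_t threads) {
        std::lock_guard<std::mutex> lock(g_mutex);
        if (!g_shared) {
            g_shared = new std::vector<char>(perThread * threads);
        }
        return g_shared;
    }

The matching destructor change below skips the delete when a worker holds the shared block, since all workers point at the same memory.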
@@ -85,7 +106,13 @@ xmrig::CpuWorker<N>::~CpuWorker()
 #   endif
 
     CnCtx::release(m_ctx, N);
-    delete m_memory;
+
+#   ifdef XMRIG_ALGO_CN_HEAVY
+    if (m_memory != cn_heavyZen3Memory)
+#   endif
+    {
+        delete m_memory;
+    }
 }
 
 
@@ -387,7 +414,16 @@ template<size_t N>
 void xmrig::CpuWorker<N>::allocateCnCtx()
 {
     if (m_ctx[0] == nullptr) {
-        CnCtx::create(m_ctx, m_memory->scratchpad(), m_algorithm.l3(), N);
+        int shift = 0;
+
+#       ifdef XMRIG_ALGO_CN_HEAVY
+        // cn-heavy optimization for Zen3 CPUs
+        if (m_memory == cn_heavyZen3Memory) {
+            shift = (id() / 8) * m_algorithm.l3() * 8 + (id() % 8) * 64;
+        }
+#       endif
+
+        CnCtx::create(m_ctx, m_memory->scratchpad() + shift, m_algorithm.l3(), N);
     }
 }
 
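A worked example of the shift formula (not part of the commit): each group of 8 workers shares an 8 * l3() region, and within a group the base pointers are staggered by one 64-byte cache line. Assuming cn-heavy's 4 MiB scratchpad:

    #include <cstdint>
    #include <cstdio>

    int main() {
        constexpr uint64_t l3 = 4ull << 20; // cn-heavy scratchpad size, 4 MiB

        // Workers 0-7 get bases 0, 64, ..., 448 inside the first 32 MiB region;
        // worker 8 starts the next 32 MiB region at offset 33554432.
        for (uint64_t id = 0; id < 10; ++id) {
            const uint64_t shift = (id / 8) * l3 * 8 + (id % 8) * 64;
            std::printf("worker %2llu: base offset %llu\n",
                        (unsigned long long)id, (unsigned long long)shift);
        }
        return 0;
    }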
@@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
         return;
     }
 
+    std::vector<std::pair<int64_t, int32_t>> threads_data;
+    threads_data.reserve(cores.size());
+
     size_t pu_id = 0;
     while (cacheHashes > 0 && PUs > 0) {
         bool allocated_pu = false;
 
+        threads_data.clear();
         for (hwloc_obj_t core : cores) {
             const std::vector<hwloc_obj_t> units = findByType(core, HWLOC_OBJ_PU);
             if (units.size() <= pu_id) {
@@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
             PUs--;
 
             allocated_pu = true;
-            threads.add(units[pu_id]->os_index, intensity);
+            threads_data.emplace_back(units[pu_id]->os_index, intensity);
 
             if (cacheHashes == 0) {
                 break;
             }
         }
 
+        // Reversing of "threads_data" and "cores" is done to fill in virtual cores starting from the last one, but still in order
+        // For example, cn-heavy threads on 6-core Zen2/Zen3 will have affinity [0,2,4,6,8,10,9,11]
+        // This is important for Zen3 cn-heavy optimization
+
+        if (pu_id & 1) {
+            std::reverse(threads_data.begin(), threads_data.end());
+        }
+
+        for (const auto& t : threads_data) {
+            threads.add(t.first, t.second);
+        }
+
         if (!allocated_pu) {
             break;
         }
 
         pu_id++;
+        std::reverse(cores.begin(), cores.end());
     }
 #   endif
 }
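A standalone sketch (not part of the commit) that reproduces the affinity order from the comment above. It assumes a 6-core/12-PU CPU where core c exposes PU os_index values 2*c and 2*c+1, and an L3 budget of 8 cn-heavy threads (e.g. a 32 MiB L3 divided by the 4 MiB scratchpad):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> cores = {0, 1, 2, 3, 4, 5};
        int budget = 8; // hashes that fit in the L3

        std::vector<int> order, pass;
        for (int pu = 0; pu < 2 && budget > 0; ++pu) {
            pass.clear();
            for (int c : cores) {
                if (budget == 0) {
                    break;
                }
                --budget;
                pass.push_back(2 * c + pu); // os_index of PU "pu" on core c
            }
            if (pu & 1) { // odd passes walk reversed cores: restore ascending order
                std::reverse(pass.begin(), pass.end());
            }
            order.insert(order.end(), pass.begin(), pass.end());
            std::reverse(cores.begin(), cores.end()); // next pass starts from the last core
        }

        for (int t : order) {
            std::printf("%d ", t); // prints: 0 2 4 6 8 10 9 11
        }
        std::printf("\n");
        return 0;
    }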