Optimized cn-heavy for Zen3

- Uses scratchpad interleaving to access only the closest L3 slice from each CPU core.
- Also activates the MSR mod for cn-heavy, because the CPU prefetchers get confused by the interleaving.
- 7-8% speedup on Zen3
This commit is contained in:
SChernykh 2021-02-07 22:05:11 +01:00
parent b1e14dc1d3
commit 8af8df25aa
8 changed files with 187 additions and 81 deletions

View file

@ -363,10 +363,14 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
return;
}
std::vector<std::pair<int64_t, int32_t>> threads_data;
threads_data.reserve(cores.size());
size_t pu_id = 0;
while (cacheHashes > 0 && PUs > 0) {
bool allocated_pu = false;
threads_data.clear();
for (hwloc_obj_t core : cores) {
const std::vector<hwloc_obj_t> units = findByType(core, HWLOC_OBJ_PU);
if (units.size() <= pu_id) {
@ -377,18 +381,31 @@ void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorith
PUs--;
allocated_pu = true;
threads.add(units[pu_id]->os_index, intensity);
threads_data.emplace_back(units[pu_id]->os_index, intensity);
if (cacheHashes == 0) {
break;
}
}
// "threads_data" is reversed on odd passes and "cores" is reversed after every pass, so that virtual cores are filled in starting from the last core while the resulting thread list still stays in order.
// For example, cn-heavy threads on a 6-core Zen2/Zen3 will have affinity [0,2,4,6,8,10,9,11].
// This ordering is important for the Zen3 cn-heavy optimization.
if (pu_id & 1) {
std::reverse(threads_data.begin(), threads_data.end());
}
for (const auto& t : threads_data) {
threads.add(t.first, t.second);
}
if (!allocated_pu) {
break;
}
pu_id++;
std::reverse(cores.begin(), cores.end());
}
# endif
}