block_of_code: Allow Fast BMI2 paths on Zen 3 (#593)

BMI2 instructions such as `pdep` and `pext` have been
known to be incredibly slow on AMD. On Zen 3
and newer, the performance of these instructions
is now much better, but earlier AMD
architectures should still avoid BMI2.

On Zen 2, pdep/pext took 300 cycles; on Zen 3 they take 3 cycles.
This is a big enough improvement to allow BMI2 code to
be dispatched if available. Zen 3 is detected by
checking the family of the processor.
This commit is contained in:
Wunk 2021-03-27 14:36:51 -07:00 committed by GitHub
parent c28f13af97
commit e06933f123
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -13,6 +13,7 @@
#include "backend/x64/block_of_code.h"
#include "backend/x64/perf_map.h"
#include "common/assert.h"
#include "common/bit_util.h"
#ifdef _WIN32
#include <windows.h>
@ -364,7 +365,21 @@ bool BlockOfCode::HasBMI2() const {
}
/// Reports whether the host CPU supports BMI2 *and* executes it quickly enough
/// for BMI2-based code paths to be worth dispatching.
///
/// @return false if BMI2 is not supported at all;
///         true on non-AMD CPUs that support BMI2;
///         on AMD CPUs, true only when the CPUID family is 0x19 (Zen 3) or newer.
bool BlockOfCode::HasFastBMI2() const {
    if (!DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
        return false;
    }

    if (!DoesCpuSupport(Xbyak::util::Cpu::tAMD)) {
        return true;
    }

    // BMI2 instructions such as pdep and pext have been very slow on AMD up until Zen 3.
    // Check for Zen 3 or newer by its family (0x19).
    // See also: https://en.wikichip.org/wiki/amd/cpuid
    std::array<u32, 4> data{};
    cpu_info.getCpuid(1, data.data());

    // CPUID leaf 1, EAX: bits 8-11 hold the base family, bits 20-27 the extended family.
    const u32 family_base = Common::Bits<8, 11>(data[0]);
    const u32 family_extended = Common::Bits<20, 27>(data[0]);

    // The extended family field only contributes when the base family is saturated at 0xF;
    // otherwise the base family is the full family value.
    const u32 family = family_base == 0xF ? family_base + family_extended : family_base;
    return family >= 0x19;
}
bool BlockOfCode::HasFMA() const {