backend/x64: Implement HostFeature

This commit is contained in:
Wunkolo 2021-05-10 00:17:21 -07:00 committed by merry
parent b93ae62acf
commit 105b464bc1
17 changed files with 346 additions and 345 deletions

View file

@ -288,6 +288,7 @@ if (ARCHITECTURE STREQUAL "x86_64")
backend/x64/exclusive_monitor.cpp
backend/x64/hostloc.cpp
backend/x64/hostloc.h
backend/x64/host_feature.h
backend/x64/jitstate_info.h
backend/x64/oparg.h
backend/x64/perf_map.cpp

View file

@ -295,7 +295,7 @@ void A32EmitX64::GenTerminalHandlers() {
calculate_location_descriptor();
code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(ebp, r12d);
}
code.and_(ebp, fast_dispatch_table_mask);
@ -313,7 +313,7 @@ void A32EmitX64::GenTerminalHandlers() {
code.align();
fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>();
code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(code.ABI_PARAM1.cvt32(), code.ABI_PARAM2.cvt32());
}
code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
@ -428,7 +428,7 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
// Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
// so we load them both at the same time with one 64-bit read. This allows us to
// extract all of their bits together at once with one pext.
@ -456,7 +456,7 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.or_(result, tmp);
code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
code.mov(tmp, NZCV::x64_mask);
code.pext(tmp2, tmp2, tmp);
code.shl(tmp2, 28);
@ -490,7 +490,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
// cpsr_nzcv
code.mov(tmp, cpsr);
code.shr(tmp, 28);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
code.mov(tmp2, NZCV::x64_mask);
code.pdep(tmp, tmp, tmp2);
} else {
@ -504,7 +504,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.and_(tmp, 0x07F0FDDF);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
// cpsr_et and cpsr_ge
static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
// This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword.
@ -546,7 +546,7 @@ void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
const u32 imm = args[0].GetImmediateU32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
} else if (code.HasFastBMI2()) {
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -571,7 +571,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
} else if (code.HasFastBMI2()) {
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -698,7 +698,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
ge |= Common::Bit<16>(imm) ? 0x000000FF : 0;
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
} else if (code.HasFastBMI2()) {
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -860,7 +860,7 @@ void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();

View file

@ -164,7 +164,7 @@ void A64EmitX64::GenMemory128Accessors() {
#else
code.sub(rsp, 8);
Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.movq(xmm1, code.ABI_RETURN);
code.pinsrq(xmm1, code.ABI_RETURN2, 1);
} else {
@ -187,7 +187,7 @@ void A64EmitX64::GenMemory128Accessors() {
code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
#else
code.sub(rsp, 8);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.movq(code.ABI_PARAM3, xmm1);
code.pextrq(code.ABI_PARAM4, xmm1, 1);
} else {
@ -338,7 +338,7 @@ void A64EmitX64::GenTerminalHandlers() {
calculate_location_descriptor();
code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(rbx, r12d);
}
code.and_(ebp, fast_dispatch_table_mask);
@ -356,7 +356,7 @@ void A64EmitX64::GenTerminalHandlers() {
code.align();
fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>();
code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
}
code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
@ -393,7 +393,7 @@ void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(tmp, NZCV::x64_mask);
code.pext(nzcv_raw, nzcv_raw, tmp);
@ -412,7 +412,7 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
code.shr(nzcv_raw, 28);
if (code.HasFastBMI2()) {
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(tmp, NZCV::x64_mask);
code.pdep(nzcv_raw, nzcv_raw, tmp);
@ -804,7 +804,7 @@ Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bit
code.shr(tmp, int(page_bits));
} else if (ctx.conf.silently_mirror_page_table) {
if (valid_page_index_bits >= 32) {
if (code.HasBMI2()) {
if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr();
code.mov(bit_count, unused_top_bits);
code.bzhi(tmp, vaddr, bit_count);

View file

@ -61,7 +61,7 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const
size_t xmm_offset = frame_info.xmm_offset;
for (HostLoc xmm : regs) {
if (HostLocIsXMM(xmm)) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
} else {
code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
@ -83,7 +83,7 @@ void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const
size_t xmm_offset = frame_info.xmm_offset;
for (HostLoc xmm : regs) {
if (HostLocIsXMM(xmm)) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);
} else {
code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);

View file

@ -73,6 +73,56 @@ void ProtectMemory(const void* base, size_t size, bool is_executable) {
}
#endif
// Probes the host CPU once and returns the set of HostFeature flags it supports.
//
// When DYNARMIC_ENABLE_CPU_FEATURE_DETECTION is not defined no probing is done
// and the returned set is empty.
HostFeature GetHostFeatures() {
    HostFeature features = {};

#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION
    using Cpu = Xbyak::util::Cpu;
    const Cpu cpu_info;

    if (cpu_info.has(Cpu::tSSSE3)) features |= HostFeature::SSSE3;
    if (cpu_info.has(Cpu::tSSE41)) features |= HostFeature::SSE41;
    if (cpu_info.has(Cpu::tSSE42)) features |= HostFeature::SSE42;
    if (cpu_info.has(Cpu::tAVX)) features |= HostFeature::AVX;
    if (cpu_info.has(Cpu::tAVX2)) features |= HostFeature::AVX2;
    if (cpu_info.has(Cpu::tAVX512F)) features |= HostFeature::AVX512F;
    if (cpu_info.has(Cpu::tAVX512CD)) features |= HostFeature::AVX512CD;
    if (cpu_info.has(Cpu::tAVX512VL)) features |= HostFeature::AVX512VL;
    if (cpu_info.has(Cpu::tAVX512BW)) features |= HostFeature::AVX512BW;
    if (cpu_info.has(Cpu::tAVX512DQ)) features |= HostFeature::AVX512DQ;
    if (cpu_info.has(Cpu::tAVX512_BITALG)) features |= HostFeature::AVX512BITALG;
    if (cpu_info.has(Cpu::tPCLMULQDQ)) features |= HostFeature::PCLMULQDQ;
    if (cpu_info.has(Cpu::tF16C)) features |= HostFeature::F16C;
    if (cpu_info.has(Cpu::tFMA)) features |= HostFeature::FMA;
    if (cpu_info.has(Cpu::tAESNI)) features |= HostFeature::AES;
    if (cpu_info.has(Cpu::tPOPCNT)) features |= HostFeature::POPCNT;
    if (cpu_info.has(Cpu::tBMI1)) features |= HostFeature::BMI1;
    if (cpu_info.has(Cpu::tLZCNT)) features |= HostFeature::LZCNT;
    if (cpu_info.has(Cpu::tGFNI)) features |= HostFeature::GFNI;

    if (cpu_info.has(Cpu::tBMI2)) {
        features |= HostFeature::BMI2;

        // BMI2 instructions such as pdep and pext have been very slow up until Zen 3.
        // Check for Zen 3 or newer by its family (0x19).
        // See also: https://en.wikichip.org/wiki/amd/cpuid
        bool bmi2_is_fast = true;
        if (cpu_info.has(Cpu::tAMD)) {
            std::array<u32, 4> data{};
            cpu_info.getCpuid(1, data.data());
            const u32 family_base = Common::Bits<8, 11>(data[0]);
            const u32 family_extended = Common::Bits<20, 27>(data[0]);
            // The extended family field is only defined when the base family is 0xF;
            // otherwise it is reserved and must not contribute to the family value.
            const u32 family = family_base == 0xF ? family_base + family_extended : family_base;
            bmi2_is_fast = family >= 0x19;
        }
        if (bmi2_is_fast) {
            features |= HostFeature::FastBMI2;
        }
    }
#endif

    return features;
}
} // anonymous namespace
BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, size_t far_code_offset, std::function<void(BlockOfCode&)> rcp)
@ -81,6 +131,7 @@ BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_cod
, jsi(jsi)
, far_code_offset(far_code_offset)
, constant_pool(*this, CONSTANT_POOL_SIZE)
, host_features(GetHostFeatures())
{
ASSERT(total_code_size > far_code_offset);
EnableWriting();
@ -317,106 +368,4 @@ void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) {
nop(size - current_size);
}
bool BlockOfCode::HasSSSE3() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSSE3);
}
bool BlockOfCode::HasSSE41() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSE41);
}
bool BlockOfCode::HasSSE42() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSE42);
}
bool BlockOfCode::HasPCLMULQDQ() const {
return DoesCpuSupport(Xbyak::util::Cpu::tPCLMULQDQ);
}
bool BlockOfCode::HasAVX() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX);
}
bool BlockOfCode::HasF16C() const {
return DoesCpuSupport(Xbyak::util::Cpu::tF16C);
}
bool BlockOfCode::HasAESNI() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAESNI);
}
bool BlockOfCode::HasLZCNT() const {
return DoesCpuSupport(Xbyak::util::Cpu::tLZCNT);
}
bool BlockOfCode::HasBMI1() const {
return DoesCpuSupport(Xbyak::util::Cpu::tBMI1);
}
bool BlockOfCode::HasBMI2() const {
return DoesCpuSupport(Xbyak::util::Cpu::tBMI2);
}
// Returns true when BMI2 is supported and is not expected to be slow.
//
// On non-AMD CPUs BMI2 support alone is sufficient. On AMD CPUs the CPU family
// reported by CPUID leaf 1 must additionally be 0x19 or newer.
bool BlockOfCode::HasFastBMI2() const {
    if (!DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
        return false;
    }

    // BMI2 instructions such as pdep and pext have been very slow up until Zen 3.
    // Check for Zen 3 or newer by its family (0x19).
    // See also: https://en.wikichip.org/wiki/amd/cpuid
    if (!DoesCpuSupport(Xbyak::util::Cpu::tAMD)) {
        return true;
    }

    std::array<u32, 4> data{};
    cpu_info.getCpuid(1, data.data());
    const u32 family_base = Common::Bits<8, 11>(data[0]);
    const u32 family_extended = Common::Bits<20, 27>(data[0]);
    // The extended family field is only defined when the base family is 0xF;
    // otherwise it is reserved and must not contribute to the family value.
    const u32 family = family_base == 0xF ? family_base + family_extended : family_base;
    return family >= 0x19;
}
bool BlockOfCode::HasFMA() const {
return DoesCpuSupport(Xbyak::util::Cpu::tFMA);
}
bool BlockOfCode::HasAVX2() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX2);
}
bool BlockOfCode::HasAVX512_Skylake() const {
// The feature set formerly known as AVX3.2. (Introduced with Skylake.)
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL);
}
bool BlockOfCode::HasAVX512_Icelake() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VPOPCNTDQ)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VNNI)
&& DoesCpuSupport(Xbyak::util::Cpu::tGFNI)
&& DoesCpuSupport(Xbyak::util::Cpu::tVAES)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VBMI2)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG)
&& DoesCpuSupport(Xbyak::util::Cpu::tVPCLMULQDQ);
}
bool BlockOfCode::HasAVX512_BITALG() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG);
}
// Queries cpu_info for the given feature type.
//
// When DYNARMIC_ENABLE_CPU_FEATURE_DETECTION is not defined the argument is
// ignored and every feature is reported as unsupported.
bool BlockOfCode::DoesCpuSupport([[maybe_unused]] Xbyak::util::Cpu::Type type) const {
#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION
    const bool supported = cpu_info.has(type);
#else
    const bool supported = false;
#endif
    return supported;
}
} // namespace Dynarmic::Backend::X64

View file

@ -16,6 +16,7 @@
#include "backend/x64/abi.h"
#include "backend/x64/callback.h"
#include "backend/x64/constant_pool.h"
#include "backend/x64/host_feature.h"
#include "backend/x64/jitstate_info.h"
#include "common/cast_util.h"
#include "common/common_types.h"
@ -145,22 +146,9 @@ public:
JitStateInfo GetJitStateInfo() const { return jsi; }
bool HasSSSE3() const;
bool HasSSE41() const;
bool HasSSE42() const;
bool HasPCLMULQDQ() const;
bool HasAVX() const;
bool HasF16C() const;
bool HasAESNI() const;
bool HasLZCNT() const;
bool HasBMI1() const;
bool HasBMI2() const;
bool HasFastBMI2() const;
bool HasFMA() const;
bool HasAVX2() const;
bool HasAVX512_Skylake() const;
bool HasAVX512_Icelake() const;
bool HasAVX512_BITALG() const;
// Returns true only when every bit set in `feature` is also set in
// host_features, so a combined mask requires all of its members.
bool HasHostFeature(HostFeature feature) const {
    const auto present = host_features & feature;
    return present == feature;
}
private:
RunCodeCallbacks cb;
@ -185,8 +173,7 @@ private:
std::array<const void*, 4> return_from_run_code;
void GenRunCode(std::function<void(BlockOfCode&)> rcp);
Xbyak::util::Cpu cpu_info;
bool DoesCpuSupport(Xbyak::util::Cpu::Type type) const;
const HostFeature host_features;
};
} // namespace Dynarmic::Backend::X64

View file

@ -161,7 +161,7 @@ void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
value |= Common::Bit<28>(args[0].GetImmediateU32()) ? (1 << 0) : 0;
code.mov(nzcv, value);
ctx.reg_alloc.DefineValue(inst, nzcv);
} else if (code.HasFastBMI2()) {
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();

View file

@ -41,7 +41,7 @@ static void EmitAESFunction(RegAlloc::ArgumentInfo args, EmitContext& ctx, Block
void EmitX64::EmitAESDecryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAESNI()) {
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
@ -58,7 +58,7 @@ void EmitX64::EmitAESDecryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAESNI()) {
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
@ -75,7 +75,7 @@ void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAESNI()) {
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
code.aesimc(data, data);
@ -90,7 +90,7 @@ void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitAESMixColumns(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAESNI()) {
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();

View file

@ -19,7 +19,7 @@ namespace CRC32 = Common::Crypto::CRC32;
static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size);
code.crc32(crc, value);
@ -35,7 +35,7 @@ static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasPCLMULQDQ() && data_size < 32) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size < 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -49,7 +49,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.movd(xmm_tmp, value.cvt32());
code.pslldq(xmm_tmp, (64 - data_size) / 8);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpclmulqdq(xmm_value, xmm_tmp, xmm_const, 0x00);
code.pclmulqdq(xmm_value, xmm_const, 0x10);
code.pxor(xmm_value, xmm_tmp);
@ -66,7 +66,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
return;
}
if (code.HasPCLMULQDQ() && data_size == 32) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -87,7 +87,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
return;
}
if (code.HasPCLMULQDQ() && data_size == 64) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();

View file

@ -36,7 +36,7 @@ void EmitX64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.movq(result, lo);
code.pinsrq(result, hi, 1);
} else {
@ -303,7 +303,7 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -392,7 +392,7 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
@ -441,7 +441,7 @@ void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -528,7 +528,7 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
@ -573,7 +573,7 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 31 ? shift : 31));
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -658,7 +658,7 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 63 ? shift : 63));
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
} else if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
@ -697,7 +697,7 @@ void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
auto& carry_arg = args[2];
if (!carry_inst) {
if (shift_arg.IsImmediate() && code.HasBMI2()) {
if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -768,7 +768,7 @@ void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
auto& operand_arg = args[0];
auto& shift_arg = args[1];
if (shift_arg.IsImmediate() && code.HasBMI2()) {
if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
@ -831,7 +831,7 @@ static void EmitMaskedShift32(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
}
if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
if (code.HasBMI2()) {
if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
@ -868,7 +868,7 @@ static void EmitMaskedShift64(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
}
if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
if (code.HasBMI2()) {
if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
@ -1482,7 +1482,7 @@ void EmitX64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasLZCNT()) {
if (code.HasHostFeature(HostFeature::LZCNT)) {
const Xbyak::Reg32 source = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -1506,7 +1506,7 @@ void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasLZCNT()) {
if (code.HasHostFeature(HostFeature::LZCNT)) {
const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]).cvt64();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();

View file

@ -103,7 +103,7 @@ void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list
code.andps(xmm0, xmm);
if constexpr (fsize == 32) {
code.pcmpgtd(xmm0, code.MConst(xword, f32_smallest_normal - 1));
} else if (code.HasSSE42()) {
} else if (code.HasHostFeature(HostFeature::SSE42)) {
code.pcmpgtq(xmm0, code.MConst(xword, f64_smallest_normal - 1));
} else {
code.pcmpgtd(xmm0, code.MConst(xword, f64_smallest_normal - 1));
@ -124,7 +124,7 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunords)(xmm0, result, result);
FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
} else {
@ -205,7 +205,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
// op1 == QNaN && op2 == QNaN is the most common case. With this method
// that case would only require one branch.
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vxorps(xmm0, op1, op2);
} else {
code.movaps(xmm0, op1);
@ -240,7 +240,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
code.jna(end, code.T_NEAR);
// Silence the SNaN as required by spec.
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vorps(result, op2, code.MConst(xword, mantissa_msb));
} else {
code.movaps(result, op2);
@ -596,7 +596,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if constexpr (fsize != 16) {
if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
@ -607,7 +607,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasFMA()) {
if (code.HasHostFeature(HostFeature::FMA)) {
Xbyak::Label end, fallback;
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@ -709,7 +709,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
Xbyak::Label end, nan, op_are_nans;
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vmuls)(result, op1, op2);
} else {
code.movaps(result, op1);
@ -723,7 +723,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.L(nan);
FCODE(ucomis)(op1, op2);
code.jp(op_are_nans);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vxorps(result, op1, op2);
} else {
code.movaps(result, op1);
@ -824,7 +824,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if constexpr (fsize != 16) {
if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
Xbyak::Label end, fallback;
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@ -838,7 +838,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
return;
}
if (code.HasFMA()) {
if (code.HasHostFeature(HostFeature::FMA)) {
Xbyak::Label end, fallback;
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@ -909,7 +909,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz
const bool exact = inst->GetArg(2).GetU1();
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (fsize != 16 && code.HasSSE41() && round_imm && !exact) {
if (fsize != 16 && code.HasHostFeature(HostFeature::SSE41) && round_imm && !exact) {
if (fsize == 64) {
FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) {
code.roundsd(result, result, *round_imm);
@ -1089,7 +1089,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
}
code.L(zero);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpor(result, value, code.MConst(xword, 0x7FF0'0000'0000'0000));
} else {
code.movaps(result, value);
@ -1099,7 +1099,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
code.L(nan);
if (!ctx.FPCR().DN()) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpor(result, operand, code.MConst(xword, 0x0008'0000'0000'0000));
} else {
code.movaps(result, operand);
@ -1159,7 +1159,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if constexpr (fsize != 16) {
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -1172,7 +1172,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
return;
}
if (code.HasFMA() && code.HasAVX()) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
Xbyak::Label end, fallback;
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@ -1330,7 +1330,7 @@ void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
@ -1356,7 +1356,7 @@ void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
@ -1403,7 +1403,7 @@ void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) {
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
if (ctx.FPCR().DN()) {
@ -1468,7 +1468,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if constexpr (fsize != 16) {
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (code.HasSSE41() && round_imm){
if (code.HasHostFeature(HostFeature::SSE41) && round_imm){
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
@ -1720,7 +1720,7 @@ void EmitX64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
const auto op = [&]{
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512F)) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2ss(result, result, from.cvt32());
} else {
@ -1813,7 +1813,7 @@ void EmitX64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) {
const size_t fbits = args[1].GetImmediateU8();
[[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512F)) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2sd(to, to, from.cvt32());
} else {
@ -1878,7 +1878,7 @@ void EmitX64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512F)) {
code.vcvtusi2sd(result, result, from);
} else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1909,7 +1909,7 @@ void EmitX64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512F)) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2ss(result, result, from);
} else {

View file

@ -76,7 +76,7 @@ void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
code.paddw(xmm_a, xmm_b);
if (ge_inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
@ -199,7 +199,7 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
@ -685,7 +685,7 @@ void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
code.por(from, ge);
ctx.reg_alloc.DefineValue(inst, from);
} else if (code.HasBMI1()) {
} else if (code.HasHostFeature(HostFeature::BMI1)) {
const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();

View file

@ -161,7 +161,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pextrb(dest, source, index);
} else {
code.pextrw(dest, source, index / 2);
@ -197,7 +197,7 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
code.pextrd(dest, source, index);
} else {
@ -225,7 +225,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64();
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
code.pextrq(dest, source, 1);
} else {
@ -243,7 +243,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt8();
code.pinsrb(source_vector, source_elem.cvt32(), index);
@ -288,7 +288,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt32();
code.pinsrd(source_vector, source_elem, index);
@ -311,7 +311,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]);
code.pinsrq(source_vector, source_elem, index);
@ -334,7 +334,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
}
static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pabsb(data, data);
} else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -345,7 +345,7 @@ static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& da
}
static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pabsw(data, data);
} else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -356,7 +356,7 @@ static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& d
}
static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pabsd(data, data);
} else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -368,7 +368,7 @@ static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& d
}
static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpabsq(data, data);
} else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -439,7 +439,7 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
}
static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
if (code.HasAVX512_Icelake()) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
const u64 shift_matrix = shift_amount < 8
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
: 0x8080808080808080;
@ -494,7 +494,7 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63));
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpsraq(result, result, shift_amount);
} else {
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
@ -548,7 +548,7 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -580,7 +580,7 @@ void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -611,7 +611,7 @@ void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -645,10 +645,10 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasSSSE3()) {
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pxor(tmp, tmp);
@ -684,9 +684,9 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasSSSE3()) {
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pxor(tmp, tmp);
@ -704,7 +704,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastw(a, a);
} else {
code.pshuflw(a, a, 0);
@ -718,7 +718,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastd(a, a);
} else {
code.pshufd(a, a, 0);
@ -731,7 +731,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastq(a, a);
} else {
code.punpcklqdq(a, a);
@ -756,7 +756,7 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
}
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -785,7 +785,7 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -817,7 +817,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -857,7 +857,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1087,7 +1087,7 @@ void EmitX64::EmitVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq);
return;
}
@ -1108,7 +1108,7 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1144,7 +1144,7 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -1195,7 +1195,7 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE42()) {
if (code.HasHostFeature(HostFeature::SSE42)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
return;
}
@ -1471,7 +1471,7 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
code.pxor(result, result);
} else if (shift_amount == 1) {
code.paddb(result, result);
} else if (code.HasAVX512_Icelake()) {
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
} else {
@ -1528,7 +1528,7 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
// Do nothing
} else if (shift_amount >= 8) {
code.pxor(result, result);
} else if (code.HasAVX512_Icelake()) {
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
} else {
@ -1582,7 +1582,7 @@ void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1610,7 +1610,7 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1638,7 +1638,7 @@ void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX2()) {
if (code.HasHostFeature(HostFeature::AVX2)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1666,7 +1666,7 @@ void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
return;
}
@ -1691,7 +1691,7 @@ void EmitX64::EmitVectorMaxS16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
return;
}
@ -1712,12 +1712,12 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1740,7 +1740,7 @@ void EmitX64::EmitVectorMaxU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
return;
}
@ -1756,7 +1756,7 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
return;
}
@ -1783,12 +1783,12 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1811,7 +1811,7 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
return;
}
@ -1836,7 +1836,7 @@ void EmitX64::EmitVectorMinS16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
return;
}
@ -1857,12 +1857,12 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
@ -1885,7 +1885,7 @@ void EmitX64::EmitVectorMinU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
return;
}
@ -1904,7 +1904,7 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
return;
}
@ -1933,12 +1933,12 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
@ -1986,7 +1986,7 @@ void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
return;
}
@ -2009,14 +2009,14 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
return;
}
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr();
@ -2062,7 +2062,7 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -2090,7 +2090,7 @@ void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512F implementation
code.pxor(zeros, zeros);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pblendw(a, zeros, 0b10101010);
code.packusdw(a, zeros);
} else {
@ -2157,7 +2157,7 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pxor(tmp, tmp);
code.phaddw(xmm_a, tmp);
} else {
@ -2180,7 +2180,7 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pxor(tmp, tmp);
code.phaddd(xmm_a, tmp);
} else {
@ -2218,7 +2218,7 @@ void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
@ -2248,7 +2248,7 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
@ -2324,7 +2324,7 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpsraq(c, a, 32);
code.vpsllq(a, a, 32);
code.vpsraq(a, a, 32);
@ -2441,7 +2441,7 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp, y, 0b10001000);
code.shufps(x, y, 0b11011101);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmaxsd(x, tmp);
ctx.reg_alloc.DefineValue(inst, x);
@ -2481,7 +2481,7 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp1, y, 0b10001000);
code.shufps(x, y, 0b11011101);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmaxud(x, tmp1);
ctx.reg_alloc.DefineValue(inst, x);
@ -2526,7 +2526,7 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp, y, 0b10001000);
code.shufps(x, y, 0b11011101);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminsd(x, tmp);
ctx.reg_alloc.DefineValue(inst, x);
@ -2566,7 +2566,7 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp1, y, 0b10001000);
code.shufps(x, y, 0b11011101);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminud(x, tmp1);
ctx.reg_alloc.DefineValue(inst, x);
@ -2604,7 +2604,7 @@ static D PolynomialMultiply(T lhs, T rhs) {
}
void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
@ -2620,7 +2620,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
code.mov(counter, 8);
code.L(loop);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpand(xmm0, xmm_b, mask);
code.vpxor(alternate, result, xmm_a);
} else {
@ -2646,7 +2646,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -2664,7 +2664,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
code.mov(counter, 8);
code.L(loop);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpand(xmm0, xmm_b, mask);
code.vpxor(alternate, result, xmm_a);
} else {
@ -2692,7 +2692,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
}
void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasPCLMULQDQ()) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
@ -2723,7 +2723,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins
}
void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.HasAVX512_BITALG()) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -2733,7 +2733,7 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -2770,7 +2770,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasAVX512_Icelake()) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
code.vgf2p8affineqb(data, data, code.MConst(xword_b, 0x8040201008040201), 0);
} else {
const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
@ -2779,7 +2779,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
code.pxor(data, high_nibble_reg);
code.psrld(high_nibble_reg, 4);
if (code.HasSSSE3()) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
// High lookup
const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
code.movdqa(high_reversed_reg, code.MConst(xword, 0xE060A020C0408000, 0xF070B030D0509010));
@ -3020,7 +3020,7 @@ void EmitX64::EmitVectorShuffleWords(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pmovsxbw(a, a);
ctx.reg_alloc.DefineValue(inst, a);
@ -3036,7 +3036,7 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pmovsxwd(a, a);
ctx.reg_alloc.DefineValue(inst, a);
@ -3054,7 +3054,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovsxdq(a, a);
} else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -3076,7 +3076,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
code.movq(gpr_tmp, data);
code.sar(gpr_tmp, 63);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pinsrq(data, gpr_tmp, 1);
} else {
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
@ -3147,7 +3147,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (upper_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(result, x, y);
} else {
code.movdqa(result, x);
@ -3160,7 +3160,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (lower_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
code.movdqa(result, x);
@ -3177,7 +3177,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasAVX()) {
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -3189,7 +3189,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -3346,7 +3346,7 @@ void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
return;
}
@ -3381,7 +3381,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) {
case 8:
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddb(result, x, xmm0);
} else {
code.movdqa(result, x);
@ -3389,7 +3389,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
}
break;
case 16:
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddw(result, x, xmm0);
} else {
code.movdqa(result, x);
@ -3397,7 +3397,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
}
break;
case 32:
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddd(result, x, xmm0);
} else {
code.movdqa(result, x);
@ -3405,7 +3405,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
}
break;
case 64:
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddq(result, x, xmm0);
} else {
code.movdqa(result, x);
@ -3414,10 +3414,10 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
break;
}
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
// xmm0 = majority(~y, x, res)
code.vpternlogd(xmm0, x, result, 0b10001110);
} else if (code.HasAVX()) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
code.vpor(tmp, x, result);
code.pand(x, result);
code.vpblendvb(xmm0, tmp, x, xmm0);
@ -3433,7 +3433,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) {
case 8:
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
code.pcmpeqb(tmp2, tmp2);
code.pxor(tmp, tmp);
@ -3454,7 +3454,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
code.psrad(xmm0, 31);
break;
case 64:
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpsraq(xmm0, xmm0, 63);
} else {
code.psrad(xmm0, 31);
@ -3484,7 +3484,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
code.pmovmskb(mask, xmm0);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pblendvb(result, tmp);
} else {
code.pandn(xmm0, result);
@ -3521,14 +3521,14 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(upper_tmp, x, y);
} else {
code.movdqa(upper_tmp, x);
code.pmulhw(upper_tmp, y);
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(lower_tmp, x, y);
} else {
code.movdqa(lower_tmp, x);
@ -3541,7 +3541,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
if (lower_inst) {
const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddw(lower_result, lower_tmp, lower_tmp);
} else {
code.movdqa(lower_result, lower_tmp);
@ -3555,7 +3555,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
if (upper_inst) {
const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpsrlw(lower_tmp, lower_tmp, 15);
code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
code.vpor(upper_result, upper_tmp, lower_tmp);
@ -3586,7 +3586,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply32(EmitContext& ctx, IR::
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm();
@ -3706,7 +3706,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
code.punpcklwd(y, y);
code.pmaddwd(x, y);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpcmpeqd(y, x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
code.vpxor(x, x, y);
} else {
@ -3728,7 +3728,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmovsxdq(x, x);
code.vpmovsxdq(y, y);
code.vpmuldq(x, x, y);
@ -3759,7 +3759,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
}
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpcmpeqq(y, x, code.MConst(xword, 0x8000000000000000, 0x8000000000000000));
code.vpxor(x, x, y);
code.vpmovmskb(bit, y);
@ -3850,7 +3850,7 @@ static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, Blo
code.punpcklbw(reconstructed, zero);
break;
case 32:
ASSERT(code.HasSSE41());
ASSERT(code.HasHostFeature(HostFeature::SSE41));
code.packusdw(dest, dest); // SSE4.1
code.movdqa(reconstructed, dest);
code.punpcklwd(reconstructed, zero);
@ -3873,7 +3873,7 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned16(EmitContext& ctx, IR::
}
void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst);
return;
}
@ -3982,7 +3982,7 @@ void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorSignedSaturatedNeg(64, code, ctx, inst);
return;
}
@ -4161,7 +4161,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
0x6060606060606060,
};
if (code.HasSSSE3() && is_defaults_zero && table_size <= 2) {
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size <= 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
@ -4178,7 +4178,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41() && table_size <= 2) {
if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
@ -4189,7 +4189,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
@ -4202,7 +4202,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41() && is_defaults_zero) {
if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
@ -4218,7 +4218,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1_upper);
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
@ -4233,7 +4233,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
@ -4250,7 +4250,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1_upper);
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
@ -4259,7 +4259,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
code.pshufb(xmm_table0, indicies);
code.pshufb(xmm_table1, indicies);
code.pblendvb(xmm_table0, xmm_table1);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
@ -4320,7 +4320,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B)
if (code.HasSSSE3() && is_defaults_zero && table_size == 1) {
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
@ -4331,12 +4331,12 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41() && table_size == 1) {
if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070));
} else {
code.movaps(xmm0, indicies);
@ -4349,12 +4349,12 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41() && is_defaults_zero && table_size == 2) {
if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070));
} else {
code.movaps(xmm0, indicies);
@ -4369,7 +4369,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm();
@ -4385,7 +4385,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
if (table_index == 0) {
code.pxor(xmm0, xmm0);
code.pcmpeqb(xmm0, masked);
} else if (code.HasAVX()) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
code.vpcmpeqb(xmm0, masked, code.MConst(xword, table_index, table_index));
} else {
code.movaps(xmm0, code.MConst(xword, table_index, table_index));
@ -4529,7 +4529,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
break;
}
case 32:
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
@ -4579,7 +4579,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (upper_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhuw(result, x, y);
} else {
code.movdqa(result, x);
@ -4592,7 +4592,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (lower_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
code.movdqa(result, x);
@ -4609,7 +4609,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasAVX()) {
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -4621,7 +4621,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -4870,7 +4870,7 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxbw(a, a);
} else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
@ -4883,7 +4883,7 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxwd(a, a);
} else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
@ -4896,7 +4896,7 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxdq(a, a);
} else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();

View file

@ -101,7 +101,7 @@ template<size_t fsize, size_t nargs, typename NaNHandler>
void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(nan_mask, nan_mask);
} else {
const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
@ -188,7 +188,7 @@ template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
if (fpcr.DN()) {
const Xbyak::Xmm nan_mask = xmm0;
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunordp)(nan_mask, result, result);
FCODE(blendvp)(result, GetNaNVector<fsize>(code));
} else {
@ -204,7 +204,7 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
template<size_t fsize>
void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
const Xbyak::Xmm nan_mask = xmm0;
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpordp)(nan_mask, result, result);
FCODE(vandp)(result, result, nan_mask);
} else {
@ -325,7 +325,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
fn(result, xmm_a);
}
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunordp)(nan_mask, result, result);
} else {
code.movaps(nan_mask, result);
@ -378,7 +378,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
code.movaps(result, xmm_a);
if (check_input_nan == CheckInputNaN::Yes) {
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunordp)(nan_mask, xmm_a, xmm_b);
} else {
code.movaps(nan_mask, xmm_b);
@ -394,7 +394,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
if (check_input_nan == CheckInputNaN::Yes) {
FCODE(cmpunordp)(nan_mask, result);
} else if (code.HasAVX()) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunordp)(nan_mask, result, result);
} else {
code.movaps(nan_mask, result);
@ -664,9 +664,9 @@ void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
code.vcvtqq2pd(xmm, xmm);
} else if (code.HasSSE41()) {
} else if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
@ -715,7 +715,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vcvtudq2ps(xmm, xmm);
} else {
const Xbyak::Address mem_4B000000 = code.MConst(xword, 0x4B0000004B000000, 0x4B0000004B000000);
@ -724,7 +724,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010);
code.vpsrld(xmm, xmm, 16);
code.vpblendw(xmm, xmm, mem_53000000, 0b10101010);
@ -765,7 +765,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
if (code.HasAVX512_Skylake()) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
code.vcvtuqq2pd(xmm, xmm);
} else {
const Xbyak::Address unpack = code.MConst(xword, 0x4530000043300000, 0);
@ -775,7 +775,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
code.vmovapd(unpack_reg, unpack);
code.vmovapd(subtrahend_reg, subtrahend);
@ -895,7 +895,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
if constexpr (is_max) {
@ -952,7 +952,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
// x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
if (code.HasAVX()) {
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b);
if constexpr (is_max) {
FCODE(vandp)(eq, result, xmm_b);
@ -1017,7 +1017,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
};
if constexpr (fsize != 16) {
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[3].GetImmediateU1();
@ -1033,7 +1033,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
return;
}
if (code.HasFMA() && code.HasAVX()) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[3].GetImmediateU1();
@ -1108,7 +1108,7 @@ static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
if (ctx.FPCR(fpcr_controlled).DN() && code.HasAVX()) {
if (ctx.FPCR(fpcr_controlled).DN() && code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1280,7 +1280,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
};
if constexpr (fsize != 16) {
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
@ -1297,7 +1297,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
return;
}
if (code.HasFMA() && code.HasAVX()) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
@ -1371,7 +1371,7 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const bool exact = inst->GetArg(2).GetU1();
if constexpr (fsize != 16) {
if (code.HasSSE41() && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
const u8 round_imm = [&]() -> u8 {
switch (rounding) {
case FP::RoundingMode::ToNearest_TieEven:
@ -1494,7 +1494,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
};
if constexpr (fsize != 16) {
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
@ -1512,7 +1512,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
return;
}
if (code.HasFMA() && code.HasAVX()) {
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
@ -1616,7 +1616,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512 implementation
if constexpr (fsize != 16) {
if (code.HasSSE41() && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);

View file

@ -28,7 +28,7 @@ void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
(code.*unsaturated_fn)(xmm0, addend);
(code.*sub_fn)(xmm0, result);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(xmm0, xmm0);
} else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -96,7 +96,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}
code.pxor(tmp, code.MConst(xword, msb_mask, msb_mask));
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(xmm0, code.MConst(xword, msb_mask, msb_mask));
} else {
if constexpr (esize == 32) {
@ -109,7 +109,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
if constexpr (esize == 32) {
code.blendvps(result, tmp);
} else {
@ -196,7 +196,7 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst)
code.por(result, tmp);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(tmp, tmp);
} else {
code.movmskps(overflow.cvt32(), tmp);
@ -232,7 +232,7 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst)
code.por(result, tmp);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(tmp, tmp);
} else {
code.movmskpd(overflow.cvt32(), tmp);
@ -273,7 +273,7 @@ void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst)
code.psubd(tmp, xmm0);
code.psrad(tmp, 31);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(tmp, tmp);
} else {
code.movmskps(overflow.cvt32(), tmp);
@ -308,7 +308,7 @@ void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst)
code.psrad(tmp, 31);
code.pshufd(tmp, tmp, 0b11110101);
if (code.HasSSE41()) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(tmp, tmp);
} else {
code.movmskpd(overflow.cvt32(), tmp);

View file

@ -0,0 +1,64 @@
/* This file is part of the dynarmic project.
* Copyright (c) 2021 MerryMage
* SPDX-License-Identifier: 0BSD
*/
#pragma once
#include "common/common_types.h"
namespace Dynarmic::Backend::X64 {
// Bit-flag enumeration of host CPU features.
// Every individual feature occupies its own bit of the u64 underlying type, so
// several features can be OR-ed together into a single mask value
// (e.g. HostFeature::FMA | HostFeature::AVX).
enum class HostFeature : u64 {
SSSE3 = 1ULL << 0,
SSE41 = 1ULL << 1,
SSE42 = 1ULL << 2,
AVX = 1ULL << 3,
AVX2 = 1ULL << 4,
AVX512F = 1ULL << 5,
AVX512CD = 1ULL << 6,
AVX512VL = 1ULL << 7,
AVX512BW = 1ULL << 8,
AVX512DQ = 1ULL << 9,
AVX512BITALG = 1ULL << 10,
PCLMULQDQ = 1ULL << 11,
F16C = 1ULL << 12,
FMA = 1ULL << 13,
AES = 1ULL << 14,
POPCNT = 1ULL << 15,
BMI1 = 1ULL << 16,
BMI2 = 1ULL << 17,
LZCNT = 1ULL << 18,
GFNI = 1ULL << 19,
// Zen-based BMI2
// NOTE(review): a separate bit from BMI2; presumably set only on hosts where
// BMI2 instructions such as pext are fast - confirm against the detection code.
FastBMI2 = 1ULL << 20,
// Composite mask (AVX512F + AVX512VL), not a new bit:
// orthogonal AVX512 features on 128 and 256 vectors
AVX512_Ortho = AVX512F | AVX512VL,
// Composite mask (AVX512_Ortho + AVX512DQ), not a new bit:
// orthogonal AVX512 features for both 32-bit and 64-bit floats
AVX512_OrthoFloat = AVX512_Ortho | AVX512DQ,
};
/// Bitwise complement of a feature mask.
/// Every bit of the underlying u64 is inverted, including bits that do not
/// correspond to any named HostFeature enumerator.
constexpr HostFeature operator~(HostFeature f) {
    const u64 raw_bits = static_cast<u64>(f);
    return static_cast<HostFeature>(~raw_bits);
}
/// Union of two feature masks: the result has every bit that is set in
/// either f1 or f2.
constexpr HostFeature operator|(HostFeature f1, HostFeature f2) {
    const u64 lhs_bits = static_cast<u64>(f1);
    const u64 rhs_bits = static_cast<u64>(f2);
    return static_cast<HostFeature>(lhs_bits | rhs_bits);
}
/// Intersection of two feature masks: the result has only the bits that are
/// set in both f1 and f2.
constexpr HostFeature operator&(HostFeature f1, HostFeature f2) {
    const u64 lhs_bits = static_cast<u64>(f1);
    const u64 rhs_bits = static_cast<u64>(f2);
    return static_cast<HostFeature>(lhs_bits & rhs_bits);
}
/// Adds the bits of f to result in place.
///
/// Returns a reference to result (rather than a copy), matching the behaviour
/// of the built-in compound assignment operators so that the expression can be
/// chained, e.g. `(mask |= a) |= b`. By-value uses of the returned expression
/// keep working unchanged.
constexpr HostFeature& operator|=(HostFeature& result, HostFeature f) {
    result = result | f;
    return result;
}
/// Restricts result in place to the bits that are also set in f.
///
/// Returns a reference to result (rather than a copy), matching the behaviour
/// of the built-in compound assignment operators so that the expression can be
/// chained, e.g. `(mask &= a) &= b`. By-value uses of the returned expression
/// keep working unchanged.
constexpr HostFeature& operator&=(HostFeature& result, HostFeature f) {
    result = result & f;
    return result;
}
}

View file

@ -19,7 +19,7 @@ namespace Dynarmic::Backend::X64 {
#define MAYBE_AVX(OPCODE, ...) \
[&] { \
if (code.HasAVX()) { \
if (code.HasHostFeature(HostFeature::AVX)) { \
code.v##OPCODE(__VA_ARGS__); \
} else { \
code.OPCODE(__VA_ARGS__); \