block_of_code: Add XmmBConst

This is a redo of https://github.com/merryhime/dynarmic/pull/690 with a
much smaller footprint, introducing the new pattern while avoiding the
initial bugs
(5d9b720189).

`XmmBConst` **B**roadcasts a value as an **Xmm**-sized **Const**ant. It is
intended to eventually encourage more constant-pool hits shared between
vector and non-vector code.
This commit is contained in:
Wunkolo 2024-01-02 12:55:37 -08:00 committed by merry
parent b02292bec7
commit 917335ae8a
4 changed files with 21 additions and 31 deletions

View file

@ -10,6 +10,7 @@
#include <memory> #include <memory>
#include <type_traits> #include <type_traits>
#include <mcl/bit/bit_field.hpp>
#include <mcl/stdint.hpp> #include <mcl/stdint.hpp>
#include <xbyak/xbyak.h> #include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h> #include <xbyak/xbyak_util.h>
@ -124,6 +125,12 @@ public:
Xbyak::Address XmmConst(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0); Xbyak::Address XmmConst(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0);
// Broadcast variant of XmmConst: `value` is replicated across a u64 in
// `esize`-bit elements, and that same u64 is supplied to XmmConst as both
// the lower and the upper half of the constant.
template<size_t esize>
Xbyak::Address XmmBConst(const Xbyak::AddressFrame& frame, u64 value) {
    // Both halves are identical, so compute the replicated pattern once.
    const u64 replicated = mcl::bit::replicate_element<u64>(esize, value);
    return XmmConst(frame, replicated, replicated);
}
CodePtr GetCodeBegin() const; CodePtr GetCodeBegin() const;
size_t GetTotalCodeSize() const; size_t GetTotalCodeSize() const;

View file

@ -89,10 +89,9 @@ void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> t
FpFixup::Norm_Src, FpFixup::Norm_Src,
FpFixup::Norm_Src, FpFixup::Norm_Src,
FpFixup::Norm_Src); FpFixup::Norm_Src);
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
const Xbyak::Xmm tmp = xmm16; const Xbyak::Xmm tmp = xmm16;
FCODE(vmovap)(tmp, code.XmmConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); FCODE(vmovap)(tmp, code.XmmBConst<fsize>(xword, denormal_to_zero));
for (const Xbyak::Xmm& xmm : to_daz) { for (const Xbyak::Xmm& xmm : to_daz) {
FCODE(vfixupimms)(xmm, xmm, tmp, u8(0)); FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));

View file

@ -145,26 +145,12 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
template<size_t fsize> template<size_t fsize>
Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) { Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) {
if constexpr (fsize == 16) { return code.XmmBConst<fsize>(xword, value);
return code.XmmConst(xword, (value << 48) | (value << 32) | (value << 16) | value, (value << 48) | (value << 32) | (value << 16) | value);
} else if constexpr (fsize == 32) {
return code.XmmConst(xword, (value << 32) | value, (value << 32) | value);
} else {
static_assert(fsize == 64);
return code.XmmConst(xword, value, value);
}
} }
template<size_t fsize, u64 value> template<size_t fsize, u64 value>
Xbyak::Address GetVectorOf(BlockOfCode& code) { Xbyak::Address GetVectorOf(BlockOfCode& code) {
if constexpr (fsize == 16) { return code.XmmBConst<fsize>(xword, value);
return code.XmmConst(xword, (value << 48) | (value << 32) | (value << 16) | value, (value << 48) | (value << 32) | (value << 16) | value);
} else if constexpr (fsize == 32) {
return code.XmmConst(xword, (value << 32) | value, (value << 32) | value);
} else {
static_assert(fsize == 64);
return code.XmmConst(xword, value, value);
}
} }
template<size_t fsize> template<size_t fsize>
@ -227,7 +213,7 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
FpFixup::PosZero); FpFixup::PosZero);
FCODE(vfixupimmp)(result, result, code.XmmConst(ptr_b, u64(nan_to_zero)), u8(0)); FCODE(vfixupimmp)(result, result, code.XmmBConst<32>(ptr_b, nan_to_zero), u8(0));
} else if (code.HasHostFeature(HostFeature::AVX)) { } else if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpordp)(nan_mask, result, result); FCODE(vcmpordp)(nan_mask, result, result);
FCODE(vandp)(result, result, nan_mask); FCODE(vandp)(result, result, nan_mask);
@ -251,9 +237,8 @@ void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xb
FpFixup::Norm_Src, FpFixup::Norm_Src,
FpFixup::Norm_Src, FpFixup::Norm_Src,
FpFixup::Norm_Src); FpFixup::Norm_Src);
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
FCODE(vmovap)(tmp, code.XmmConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); FCODE(vmovap)(tmp, code.XmmBConst<fsize>(xword, denormal_to_zero));
for (const Xbyak::Xmm& xmm : to_daz) { for (const Xbyak::Xmm& xmm : to_daz) {
FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0)); FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
@ -800,9 +785,9 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vcvtudq2ps(xmm, xmm); code.vcvtudq2ps(xmm, xmm);
} else { } else {
const Xbyak::Address mem_4B000000 = code.XmmConst(xword, 0x4B0000004B000000, 0x4B0000004B000000); const Xbyak::Address mem_4B000000 = code.XmmBConst<32>(xword, 0x4B000000);
const Xbyak::Address mem_53000000 = code.XmmConst(xword, 0x5300000053000000, 0x5300000053000000); const Xbyak::Address mem_53000000 = code.XmmBConst<32>(xword, 0x53000000);
const Xbyak::Address mem_D3000080 = code.XmmConst(xword, 0xD3000080D3000080, 0xD3000080D3000080); const Xbyak::Address mem_D3000080 = code.XmmBConst<32>(xword, 0xD3000080);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -813,7 +798,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
code.vaddps(xmm, xmm, mem_D3000080); code.vaddps(xmm, xmm, mem_D3000080);
code.vaddps(xmm, tmp, xmm); code.vaddps(xmm, tmp, xmm);
} else { } else {
const Xbyak::Address mem_0xFFFF = code.XmmConst(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF); const Xbyak::Address mem_0xFFFF = code.XmmBConst<32>(xword, 0x0000FFFF);
code.movdqa(tmp, mem_0xFFFF); code.movdqa(tmp, mem_0xFFFF);
@ -831,7 +816,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
} }
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) { if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
code.pand(xmm, code.XmmConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)); code.pand(xmm, code.XmmBConst<32>(xword, 0x7FFFFFFF));
} }
}); });
@ -898,7 +883,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
} }
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) { if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
code.pand(xmm, code.XmmConst(xword, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)); code.pand(xmm, code.XmmBConst<64>(xword, 0x7FFFFFFFFFFFFFFF));
} }
}); });
@ -1504,12 +1489,11 @@ template<size_t fsize>
void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mcl::unsigned_integer_of_size<fsize>; using FPT = mcl::unsigned_integer_of_size<fsize>;
constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask; constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
constexpr u64 sign_mask64 = mcl::bit::replicate_element<fsize, u64>(sign_mask);
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.XmmConst(xword, sign_mask64, sign_mask64); const Xbyak::Address mask = code.XmmBConst<fsize>(xword, sign_mask);
code.xorps(a, mask); code.xorps(a, mask);

View file

@ -97,7 +97,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.vpmovq2m(k1, xmm0); code.vpmovq2m(k1, xmm0);
} }
ICODE(vpsra)(result | k1, result, u8(esize - 1)); ICODE(vpsra)(result | k1, result, u8(esize - 1));
ICODE(vpxor)(result | k1, result, code.XmmConst(xword_b, msb_mask, msb_mask)); ICODE(vpxor)(result | k1, result, code.XmmBConst<esize>(xword_b, msb_mask));
code.ktestb(k1, k1); code.ktestb(k1, k1);
code.setnz(overflow); code.setnz(overflow);
@ -148,7 +148,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
if constexpr (esize == 64) { if constexpr (esize == 64) {
code.pshufd(tmp, tmp, 0b11110101); code.pshufd(tmp, tmp, 0b11110101);
} }
code.pxor(tmp, code.XmmConst(xword, msb_mask, msb_mask)); code.pxor(tmp, code.XmmBConst<esize>(xword, msb_mask));
if (code.HasHostFeature(HostFeature::SSE41)) { if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(xmm0, code.XmmConst(xword, msb_mask, msb_mask)); code.ptest(xmm0, code.XmmConst(xword, msb_mask, msb_mask));