IR: SSE4.1 implementation of FPVectorRoundInt

MerryMage 2018-07-30 13:48:04 +01:00
parent 9669e49817
commit 21a28c2545
2 changed files with 123 additions and 29 deletions

@@ -34,8 +34,10 @@ namespace Dynarmic::BackendX64 {
using namespace Xbyak::util;
namespace mp = Common::mp;
namespace {
template<size_t fsize, typename T>
static T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if constexpr (fsize == 32) {
@@ -78,7 +80,7 @@ private:
};
template<size_t fsize, size_t nargs, typename NaNHandler>
static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
@@ -120,8 +122,27 @@ static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xm
code.SwitchToNearCode();
}
template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
if (ctx.FPSCR_DN()) {
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pcmpeqw(tmp, tmp);
code.movaps(nan_mask, result);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(result, nan_mask);
code.xorps(nan_mask, tmp);
code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
code.orps(result, nan_mask);
}
}
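Per lane, the masking sequence above reduces to "replace any NaN with the default quiet NaN". A minimal scalar sketch of that effect for the fsize == 32 case, assuming IEEE-754 single precision (the helper name is illustrative, not part of this commit):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Scalar model of the cmpordps/andps/xorps/andps/orps sequence for one 32-bit lane.
    std::uint32_t ForceToDefaultNaNLane32(std::uint32_t lane) {
        float value;
        std::memcpy(&value, &lane, sizeof(value));
        // cmpordps builds an all-ones mask on ordered (non-NaN) lanes; the inverted
        // mask then selects the default quiet NaN 0x7fc00000 for the NaN lanes.
        return std::isnan(value) ? UINT32_C(0x7fc00000) : lane;
    }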
template<typename T>
struct DefaultIndexer {
std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
return std::make_tuple(a[i]);
}
std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) {
return std::make_tuple(a[i], b[i]);
}
@@ -173,7 +194,48 @@ struct PairedLowerIndexer {
};
template<size_t fsize, template<typename> class Indexer, typename Function>
static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) {
void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 2>::function_type nan_handler = NaNHandler<fsize, Indexer, 2>::GetDefault()) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
if constexpr (std::is_member_function_pointer_v<Function>) {
(code.*fn)(xmm_a);
} else {
fn(xmm_a);
}
ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
ctx.reg_alloc.DefineValue(inst, xmm_a);
return;
}
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
code.movaps(nan_mask, xmm_a);
code.movaps(result, xmm_a);
FCODE(cmpunordp)(nan_mask, nan_mask);
if constexpr (std::is_member_function_pointer_v<Function>) {
(code.*fn)(result);
} else {
fn(result);
}
FCODE(cmpunordp)(nan_mask, result);
HandleNaNs<fsize, 1>(code, ctx, {result, xmm_a}, nan_mask, nan_handler);
ctx.reg_alloc.DefineValue(inst, result);
}
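The nan_mask bookkeeping above exploits the fact that the all-ones pattern written by the first cmpunordps is itself a NaN bit pattern, so the second cmpunordps leaves a lane set if either the input or the result of fn is NaN. A rough per-lane model, with illustrative names not taken from the commit:

    #include <cmath>

    // A lane needs the HandleNaNs slow path if the input was already NaN or if
    // the operation itself produced a NaN.
    bool LaneNeedsNaNHandling(float input, float result_after_fn) {
        return std::isnan(input) || std::isnan(result_after_fn);
    }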
template<size_t fsize, template<typename> class Indexer, typename Function>
void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
@@ -187,17 +249,7 @@ static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::
fn(xmm_a, xmm_b);
}
if (ctx.FPSCR_DN()) {
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pcmpeqw(tmp, tmp);
code.movaps(nan_mask, xmm_a);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(xmm_a, nan_mask);
code.xorps(nan_mask, tmp);
code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
code.orps(xmm_a, nan_mask);
}
ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
ctx.reg_alloc.DefineValue(inst, xmm_a);
return;
@@ -226,7 +278,7 @@ static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::
}
template<size_t fsize, template<typename> class Indexer, typename Function>
static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 4>::function_type nan_handler = NaNHandler<fsize, Indexer, 4>::GetDefault()) {
void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 4>::function_type nan_handler = NaNHandler<fsize, Indexer, 4>::GetDefault()) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
@@ -241,17 +293,7 @@ static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::I
fn(xmm_a, xmm_b, xmm_c);
}
if (ctx.FPSCR_DN()) {
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pcmpeqw(tmp, tmp);
code.movaps(nan_mask, xmm_a);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(xmm_a, nan_mask);
code.xorps(nan_mask, tmp);
code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
code.orps(xmm_a, nan_mask);
}
ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
ctx.reg_alloc.DefineValue(inst, xmm_a);
return;
@@ -282,7 +324,7 @@ static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::I
}
template<typename Lambda>
inline void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -307,7 +349,7 @@ inline void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
}
template<typename Lambda>
inline void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -351,7 +393,7 @@ inline void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
}
template<typename Lambda>
inline void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -398,6 +440,8 @@ inline void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
ctx.reg_alloc.DefineValue(inst, xmm0);
}
} // anonymous namespace
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -736,6 +780,34 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
const bool exact = inst->GetArg(2).GetU1();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
const u8 round_imm = [&]() -> u8 {
switch (rounding) {
case FP::RoundingMode::ToNearest_TieEven:
return 0b00;
case FP::RoundingMode::TowardsPlusInfinity:
return 0b10;
case FP::RoundingMode::TowardsMinusInfinity:
return 0b01;
case FP::RoundingMode::TowardsZero:
return 0b11;
default:
UNREACHABLE();
}
return 0;
}();
EmitTwoOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result){
if constexpr (fsize == 32) {
code.roundps(result, result, round_imm);
} else {
code.roundpd(result, result, round_imm);
}
});
return;
}
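For reference, the two low bits of the roundps/roundpd immediate are the SSE4.1 rounding control, which is what the round_imm lambda encodes; bits 3:2 stay clear, so the immediate mode is used (not MXCSR.RC) and the precision exception is not suppressed. A hedged standalone restatement of that mapping, with illustrative names not taken from the commit:

    #include <cstdint>

    // imm8[1:0]: 00 = nearest-even, 01 = toward -inf, 10 = toward +inf, 11 = toward zero.
    // imm8[2] = 0: use the immediate rounding mode rather than MXCSR.RC.
    // imm8[3] = 0: leave the precision (inexact) exception unsuppressed.
    enum class RoundingModeSketch { NearestTieEven, PlusInfinity, MinusInfinity, Zero };

    constexpr std::uint8_t RoundImmSketch(RoundingModeSketch mode) {
        switch (mode) {
        case RoundingModeSketch::NearestTieEven: return 0b00;
        case RoundingModeSketch::PlusInfinity:   return 0b10;
        case RoundingModeSketch::MinusInfinity:  return 0b01;
        case RoundingModeSketch::Zero:           return 0b11;
        }
        return 0b00;
    }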
using rounding_list = mp::list<
std::integral_constant<FP::RoundingMode, FP::RoundingMode::ToNearest_TieEven>,
std::integral_constant<FP::RoundingMode, FP::RoundingMode::TowardsPlusInfinity>,

@@ -36,6 +36,17 @@ constexpr bool IsNaN(u32 value) {
return IsQNaN(value) || IsSNaN(value);
}
/// Given a single argument, return the NaN value which would be returned by an ARM processor.
/// If the argument isn't a NaN, returns boost::none.
inline boost::optional<u32> ProcessNaNs(u32 a) {
if (IsSNaN(a)) {
return a | 0x00400000;
} else if (IsQNaN(a)) {
return a;
}
return boost::none;
}
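The constant 0x00400000 is the most significant fraction bit of a single-precision value; OR-ing it in is what turns a signalling NaN into the corresponding quiet NaN while preserving the sign and remaining payload bits (the u64 overload below does the same with 0x0008'0000'0000'0000). A small hedged sketch of that bit manipulation, not taken from the commit:

    #include <cstdint>

    // Quieting an f32 SNaN: set the top fraction bit, keep everything else.
    constexpr std::uint32_t QuietF32(std::uint32_t snan) {
        return snan | 0x00400000;
    }
    static_assert(QuietF32(0x7f800001) == 0x7fc00001, "smallest positive SNaN becomes a QNaN");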
/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
/// If neither argument is a NaN, returns boost::none.
inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
@@ -96,6 +107,17 @@ constexpr bool IsNaN(u64 value) {
return IsQNaN(value) || IsSNaN(value);
}
/// Given a single argument, return the NaN value which would be returned by an ARM processor.
/// If the argument isn't a NaN, returns boost::none.
inline boost::optional<u64> ProcessNaNs(u64 a) {
if (IsSNaN(a)) {
return a | 0x0008'0000'0000'0000;
} else if (IsQNaN(a)) {
return a;
}
return boost::none;
}
/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
/// If neither argument is a NaN, returns boost::none.
inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {