From 5ec8e48593afd6ac784b28868ba0eff60aa6a10c Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 20 Jun 2020 13:33:45 +0100 Subject: [PATCH] A32: Implement ASIMD VMUL (floating-point) * Also add fpcr_controlled arguments to FPVectorMul IR instruction * Merge ASIMD floating-point instruction implementations --- .../x64/emit_x64_vector_floating_point.cpp | 90 ++++++++++--------- src/frontend/A32/decoder/asimd.inc | 2 +- .../A32/translate/impl/asimd_three_same.cpp | 71 ++++++++------- .../A32/translate/impl/translate_arm.h | 1 + src/frontend/ir/ir_emitter.cpp | 6 +- src/frontend/ir/ir_emitter.h | 2 +- src/frontend/ir/opcodes.inc | 4 +- 7 files changed, 90 insertions(+), 86 deletions(-) diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp index 6531c8c1..3be0c80d 100644 --- a/src/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/backend/x64/emit_x64_vector_floating_point.cpp @@ -35,11 +35,6 @@ using namespace Xbyak::util; namespace { -enum FpcrControlledArgument { - Present, - Absent, -}; - template T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) { static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); @@ -53,6 +48,24 @@ T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) { #define FCODE(NAME) (code.*ChooseOnFsize(&Xbyak::CodeGenerator::NAME##s, &Xbyak::CodeGenerator::NAME##d)) +enum FpcrControlledArgument { + Present, + Absent, +}; + +template +void MaybeStandardFPSCRValue(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, Lambda lambda) { + const bool switch_mxcsr = ctx.FPCR(fpcr_controlled) != ctx.FPCR(); + + if (switch_mxcsr) { + code.EnterStandardASIMD(); + lambda(); + code.LeaveStandardASIMD(); + } else { + lambda(); + } +} + template class Indexer, size_t narg> struct NaNHandler { public: @@ -171,8 +184,8 @@ Xbyak::Address GetVectorOf(BlockOfCode& code) { } template -void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) { - if (ctx.FPCR().DN()) { +void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) { + if (fpcr.DN()) { const Xbyak::Xmm nan_mask = xmm0; if (code.HasAVX()) { FCODE(vcmpunordp)(nan_mask, result, result); @@ -287,7 +300,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins fn(result, xmm_a); } - ForceToDefaultNaN(code, ctx, result); + ForceToDefaultNaN(code, ctx.FPCR(), result); ctx.reg_alloc.DefineValue(inst, result); return; @@ -318,29 +331,33 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins ctx.reg_alloc.DefineValue(inst, result); } -template class Indexer, typename Function> +template class Indexer, FpcrControlledArgument fcarg = FpcrControlledArgument::Absent, typename Function> void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) { static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); - if (!ctx.AccurateNaN() || ctx.FPCR().DN()) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = fcarg == FpcrControlledArgument::Absent || args[2].GetImmediateU1(); + + if (!ctx.AccurateNaN() || ctx.FPCR(fpcr_controlled).DN()) { const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); if constexpr (std::is_member_function_pointer_v) { - (code.*fn)(xmm_a, xmm_b); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ + (code.*fn)(xmm_a, xmm_b); + }); } else { - fn(xmm_a, xmm_b); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ + fn(xmm_a, xmm_b); + }); } - ForceToDefaultNaN(code, ctx, xmm_a); + ForceToDefaultNaN(code, ctx.FPCR(fpcr_controlled), xmm_a); ctx.reg_alloc.DefineValue(inst, xmm_a); return; } - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); @@ -495,19 +512,6 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam ctx.reg_alloc.DefineValue(inst, result); } -template -void MaybeStandardFPSCRValue(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, Lambda lambda) { - const bool switch_mxcsr = ctx.FPCR(fpcr_controlled) != ctx.FPCR(); - - if (switch_mxcsr) { - code.EnterStandardASIMD(); - lambda(); - code.LeaveStandardASIMD(); - } else { - lambda(); - } -} - } // anonymous namespace void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) { @@ -569,9 +573,9 @@ void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -583,9 +587,9 @@ void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -772,9 +776,9 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -786,9 +790,9 @@ void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -800,9 +804,9 @@ void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -814,9 +818,9 @@ void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); @@ -946,11 +950,11 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) { - EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps); + EmitThreeOpVectorOperation<32, DefaultIndexer, FpcrControlledArgument::Present>(code, ctx, inst, &Xbyak::CodeGenerator::mulps); } void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) { - EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd); + EmitThreeOpVectorOperation<64, DefaultIndexer, FpcrControlledArgument::Present>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd); } template diff --git a/src/frontend/A32/decoder/asimd.inc b/src/frontend/A32/decoder/asimd.inc index 0b63527f..e034a505 100644 --- a/src/frontend/A32/decoder/asimd.inc +++ b/src/frontend/A32/decoder/asimd.inc @@ -38,7 +38,7 @@ INST(asimd_VMUL, "VMUL", "1111001P0Dzznnnndddd100 //INST(asimd_VPADD_float, "VPADD (floating-point)", "111100110-0C--------1101---0----") // ASIMD //INST(asimd_VABD_float, "VABD (floating-point)", "111100110-1C--------1101---0----") // ASIMD //INST(asimd_VMLA_float, "VMLA (floating-point)", "111100100-CC--------1101---1----") // ASIMD -//INST(asimd_VMUL_float, "VMUL (floating-point)", "111100110-0C--------1101---1----") // ASIMD +INST(asimd_VMUL_float, "VMUL (floating-point)", "111100110D0znnnndddd1101NQM1mmmm") // ASIMD //INST(asimd_VCEQ_reg, "VCEQ (register)", "111100100-0C--------1110---0----") // ASIMD //INST(asimd_VCGE_reg, "VCGE (register)", "111100110-0C--------1110---0----") // ASIMD //INST(asimd_VCGT_reg, "VCGT (register)", "111100110-1C--------1110---0----") // ASIMD diff --git a/src/frontend/A32/translate/impl/asimd_three_same.cpp b/src/frontend/A32/translate/impl/asimd_three_same.cpp index 8ca37936..c1fdc5ce 100644 --- a/src/frontend/A32/translate/impl/asimd_three_same.cpp +++ b/src/frontend/A32/translate/impl/asimd_three_same.cpp @@ -34,6 +34,29 @@ bool BitwiseInstruction(ArmTranslatorVisitor& v, bool D, size_t Vn, size_t Vd, b return true; } + +template +bool FloatingPointInstruction(ArmTranslatorVisitor& v, bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Callable fn) { + if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + if (sz == 0b1) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_d = v.ir.GetVector(d); + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.GetVector(m); + const auto result = fn(reg_d, reg_n, reg_m); + + v.ir.SetVector(d, result); + return true; +} } // Anonymous namespace bool ArmTranslatorVisitor::asimd_VHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { @@ -333,46 +356,22 @@ bool ArmTranslatorVisitor::asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size return true; } +bool ArmTranslatorVisitor::asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMul(32, reg_n, reg_m, false); + }); +} + bool ArmTranslatorVisitor::asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { - if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) { - return UndefinedInstruction(); - } - - if (sz == 0b1) { - return UndefinedInstruction(); - } - - const auto d = ToVector(Q, Vd, D); - const auto m = ToVector(Q, Vm, M); - const auto n = ToVector(Q, Vn, N); - - const auto reg_n = ir.GetVector(n); - const auto reg_m = ir.GetVector(m); - const auto result = ir.FPVectorMax(32, reg_m, reg_n, false); - - ir.SetVector(d, result); - return true; + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMax(32, reg_n, reg_m, false); + }); } bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { - if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) { - return UndefinedInstruction(); - } - - if (sz == 0b1) { - return UndefinedInstruction(); - } - - const auto d = ToVector(Q, Vd, D); - const auto m = ToVector(Q, Vm, M); - const auto n = ToVector(Q, Vn, N); - - const auto reg_n = ir.GetVector(n); - const auto reg_m = ir.GetVector(m); - const auto result = ir.FPVectorMin(32, reg_m, reg_n, false); - - ir.SetVector(d, result); - return true; + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMin(32, reg_n, reg_m, false); + }); } } // namespace Dynarmic::A32 diff --git a/src/frontend/A32/translate/impl/translate_arm.h b/src/frontend/A32/translate/impl/translate_arm.h index f76b355e..e7beeb12 100644 --- a/src/frontend/A32/translate/impl/translate_arm.h +++ b/src/frontend/A32/translate/impl/translate_arm.h @@ -462,6 +462,7 @@ struct ArmTranslatorVisitor final { bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 363d0642..c3d8fcb8 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -2376,12 +2376,12 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc UNREACHABLE(); } -U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) { +U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { switch (esize) { case 32: - return Inst(Opcode::FPVectorMul32, a, b); + return Inst(Opcode::FPVectorMul32, a, b, Imm1(fpcr_controlled)); case 64: - return Inst(Opcode::FPVectorMul64, a, b); + return Inst(Opcode::FPVectorMul64, a, b, Imm1(fpcr_controlled)); } UNREACHABLE(); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 42afc1b7..d9fae230 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -354,7 +354,7 @@ public: U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); - U128 FPVectorMul(size_t esize, const U128& a, const U128& b); + U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2); U128 FPVectorMulX(size_t esize, const U128& a, const U128& b); U128 FPVectorNeg(size_t esize, const U128& a); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 2b41bb9e..fea5d9ba 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -599,8 +599,8 @@ OPCODE(FPVectorMax32, U128, U128 OPCODE(FPVectorMax64, U128, U128, U128, U1 ) OPCODE(FPVectorMin32, U128, U128, U128, U1 ) OPCODE(FPVectorMin64, U128, U128, U128, U1 ) -OPCODE(FPVectorMul32, U128, U128, U128 ) -OPCODE(FPVectorMul64, U128, U128, U128 ) +OPCODE(FPVectorMul32, U128, U128, U128, U1 ) +OPCODE(FPVectorMul64, U128, U128, U128, U1 ) OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 ) OPCODE(FPVectorMulAdd32, U128, U128, U128, U128 ) OPCODE(FPVectorMulAdd64, U128, U128, U128, U128 )