From 0785a6d027e5be1a62d001a21f092a0fed2a29d8 Mon Sep 17 00:00:00 2001
From: zmt00
Date: Tue, 6 Feb 2024 18:49:15 -0800
Subject: [PATCH] ir: Implement FPMulSub
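
FPMulSub(minuend, op1, op2) computes minuend + (-op1) * op2: the product is
negated before the addition, which is what A32 VFMS/VFNMA and A64
FMSUB/FNMADD need. The soft-float fallback implements it by flipping op1's
sign bit and reusing FPMulAdd; the arm64 backend emits FMSUB for the 32- and
64-bit cases, and the x64 backend emits vfnmadd231 when FMA is available,
reusing FPMulAdd's existing special-case handling otherwise.

Illustrative sketch only, using plain doubles rather than dynarmic's FPT
bit-pattern types and ignoring rounding mode, flush-to-zero and NaN
handling; the helper names below are hypothetical and not part of this
patch:

    #include <cassert>

    // FPMulAdd-style semantics: a + (x * y)
    static double mul_add(double a, double x, double y) {
        return a + x * y;
    }

    // FPMulSub-style semantics: a + (-x * y); as in the soft-float
    // fallback, obtained by negating op1 and reusing mul_add.
    static double mul_sub(double a, double x, double y) {
        return mul_add(a, -x, y);
    }

    int main() {
        assert(mul_add(10.0, 3.0, 2.0) == 16.0);  // 10 + (3 * 2)
        assert(mul_sub(10.0, 3.0, 2.0) == 4.0);   // 10 + (-(3 * 2))
        return 0;
    }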
---
 .../arm64/emit_arm64_floating_point.cpp            | 18 ++++++
 .../backend/x64/emit_x64_floating_point.cpp        | 57 +++++++++++++++----
 src/dynarmic/common/fp/op/FPMulAdd.cpp             |  9 +++
 src/dynarmic/common/fp/op/FPMulAdd.h               |  3 +
 .../frontend/A32/translate/impl/vfp.cpp            |  4 +-
 ...g_point_data_processing_three_register.cpp      |  4 +-
 src/dynarmic/ir/ir_emitter.cpp                     | 15 +++++
 src/dynarmic/ir/ir_emitter.h                       |  1 +
 src/dynarmic/ir/microinstruction.cpp               |  3 +
 src/dynarmic/ir/opcodes.inc                        |  3 +
 tests/test_generator.cpp                           |  1 +
 11 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp b/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp
index 6d85cd23..22ec0ec2 100644
--- a/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp
+++ b/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp
@@ -328,6 +328,24 @@ void EmitIR<IR::Opcode::FPMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ct
     EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMADD(Dresult, D1, D2, Da); });
 }
 
+template<>
+void EmitIR<IR::Opcode::FPMulSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMSUB(Sresult, S1, S2, Sa); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMSUB(Dresult, D1, D2, Da); });
+}
+
 template<>
 void EmitIR<IR::Opcode::FPMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMULX(Sresult, Sa, Sb); });
diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index efa24f68..182c8875 100644
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -624,9 +624,10 @@ void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {
     FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
 }
 
-template<size_t fsize>
+template<size_t fsize, bool negate_product>
 static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mcl::unsigned_integer_of_size<fsize>;
+    const auto fallback_fn = negate_product ? &FP::FPMulSub<FPT> : &FP::FPMulAdd<FPT>;
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
@@ -639,7 +640,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
         const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
         const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
 
-        FCODE(vfmadd231s)(result, operand2, operand3);
+        if constexpr (negate_product) {
+            FCODE(vfnmadd231s)(result, operand2, operand3);
+        } else {
+            FCODE(vfmadd231s)(result, operand2, operand3);
+        }
         if (ctx.FPCR().DN()) {
             ForceToDefaultNaN<fsize>(code, result);
         }
@@ -657,7 +662,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
             const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
 
             code.movaps(result, operand1);
-            FCODE(vfmadd231s)(result, operand2, operand3);
+            if constexpr (negate_product) {
+                FCODE(vfnmadd231s)(result, operand2, operand3);
+            } else {
+                FCODE(vfmadd231s)(result, operand2, operand3);
+            }
 
             if (needs_rounding_correction && needs_nan_correction) {
                 code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
@@ -703,11 +712,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 code.sub(rsp, 16 + ABI_SHADOW_SPACE);
                 code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
                 code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
-                code.CallFunction(&FP::FPMulAdd<FPT>);
+                code.CallFunction(fallback_fn);
                 code.add(rsp, 16 + ABI_SHADOW_SPACE);
 #else
                 code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
-                code.CallFunction(&FP::FPMulAdd<FPT>);
+                code.CallFunction(fallback_fn);
 #endif
                 code.movq(result, code.ABI_RETURN);
                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
@@ -758,6 +767,9 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 code.ptest(operand2, xmm0);
                 code.jnz(op2_done);
                 code.vorps(result, operand2, xmm0);
+                if constexpr (negate_product) {
+                    code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+                }
                 code.jmp(*end);
                 code.L(op2_done);
 
@@ -769,6 +781,16 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 code.jmp(*end);
                 code.L(op3_done);
 
+                // at this point, all SNaNs have been handled
+                // if op1 was not a QNaN and op2 is, negate the result
+                if constexpr (negate_product) {
+                    FCODE(ucomis)(operand1, operand1);
+                    code.jp(*end);
+                    FCODE(ucomis)(operand2, operand2);
+                    code.jnp(*end);
+                    code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+                }
+
                 code.jmp(*end);
             }
         });
@@ -782,6 +804,9 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
        const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
 
+       if constexpr (negate_product) {
+           code.xorps(operand2, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+       }
        FCODE(muls)(operand2, operand3);
        FCODE(adds)(operand1, operand2);
 
@@ -796,24 +821,36 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
     code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
     code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
     ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
 #else
     code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
 #endif
 }
 
 void EmitX64::EmitFPMulAdd16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<16>(code, ctx, inst);
+    EmitFPMulAdd<16, false>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<32>(code, ctx, inst);
+    EmitFPMulAdd<32, false>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<64>(code, ctx, inst);
+    EmitFPMulAdd<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub16(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<16, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<64, true>(code, ctx, inst);
 }
 
 template<size_t fsize>
diff --git a/src/dynarmic/common/fp/op/FPMulAdd.cpp b/src/dynarmic/common/fp/op/FPMulAdd.cpp
index be699ef8..f97f88de 100644
--- a/src/dynarmic/common/fp/op/FPMulAdd.cpp
+++ b/src/dynarmic/common/fp/op/FPMulAdd.cpp
@@ -78,4 +78,13 @@ template u16 FPMulAdd<u16>(u16 addend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr)
 template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
 template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
 
+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+    return FPMulAdd<FPT>(minuend, (op1 ^ FPInfo<FPT>::sign_mask), op2, fpcr, fpsr);
+}
+
+template u16 FPMulSub<u16>(u16 minuend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPMulSub<u32>(u32 minuend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPMulSub<u64>(u64 minuend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
 }  // namespace Dynarmic::FP
diff --git a/src/dynarmic/common/fp/op/FPMulAdd.h b/src/dynarmic/common/fp/op/FPMulAdd.h
index 774fe88b..8e0a16cd 100644
--- a/src/dynarmic/common/fp/op/FPMulAdd.h
+++ b/src/dynarmic/common/fp/op/FPMulAdd.h
@@ -13,4 +13,7 @@ class FPSR;
 template<typename FPT>
 FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
 
+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
 }  // namespace Dynarmic::FP
diff --git a/src/dynarmic/frontend/A32/translate/impl/vfp.cpp b/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
index 2e7734ca..a4e37f74 100644
--- a/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
+++ b/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
@@ -293,7 +293,7 @@ bool TranslatorVisitor::vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool
         const auto reg_n = ir.GetExtendedRegister(n);
         const auto reg_m = ir.GetExtendedRegister(m);
         const auto reg_d = ir.GetExtendedRegister(d);
-        const auto result = ir.FPMulAdd(ir.FPNeg(reg_d), ir.FPNeg(reg_n), reg_m);
+        const auto result = ir.FPMulSub(ir.FPNeg(reg_d), reg_n, reg_m);
         ir.SetExtendedRegister(d, result);
     });
 }
@@ -333,7 +333,7 @@ bool TranslatorVisitor::vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool s
         const auto reg_n = ir.GetExtendedRegister(n);
         const auto reg_m = ir.GetExtendedRegister(m);
         const auto reg_d = ir.GetExtendedRegister(d);
-        const auto result = ir.FPMulAdd(reg_d, ir.FPNeg(reg_n), reg_m);
+        const auto result = ir.FPMulSub(reg_d, reg_n, reg_m);
         ir.SetExtendedRegister(d, result);
     });
 }
diff --git a/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp b/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
index eaad58c8..75c38c01 100644
--- a/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
+++ b/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
@@ -30,7 +30,7 @@ bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd)
     const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
     const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
     const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
-    const IR::U16U32U64 result = ir.FPMulAdd(operanda, ir.FPNeg(operand1), operand2);
+    const IR::U16U32U64 result = ir.FPMulSub(operanda, operand1, operand2);
     V_scalar(*datasize, Vd, result);
     return true;
 }
@@ -44,7 +44,7 @@ bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd
     const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
     const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
     const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
-    const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), ir.FPNeg(operand1), operand2);
+    const IR::U16U32U64 result = ir.FPMulSub(ir.FPNeg(operanda), operand1, operand2);
     V_scalar(*datasize, Vd, result);
     return true;
 }
diff --git a/src/dynarmic/ir/ir_emitter.cpp b/src/dynarmic/ir/ir_emitter.cpp
index ebb82202..fc4f69b3 100644
--- a/src/dynarmic/ir/ir_emitter.cpp
+++ b/src/dynarmic/ir/ir_emitter.cpp
@@ -2190,6 +2190,21 @@ U16U32U64 IREmitter::FPMulAdd(const U16U32U64& a, const U16U32U64& b, const U16U
     }
 }
 
+U16U32U64 IREmitter::FPMulSub(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) {
+    ASSERT(a.GetType() == b.GetType());
+
+    switch (a.GetType()) {
+    case Type::U16:
+        return Inst<U16>(Opcode::FPMulSub16, a, b, c);
+    case Type::U32:
+        return Inst<U32>(Opcode::FPMulSub32, a, b, c);
+    case Type::U64:
+        return Inst<U64>(Opcode::FPMulSub64, a, b, c);
+    default:
+        UNREACHABLE();
+    }
+}
+
 U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
     ASSERT(a.GetType() == b.GetType());
 
diff --git a/src/dynarmic/ir/ir_emitter.h b/src/dynarmic/ir/ir_emitter.h
index 0efd25c5..d37df245 100644
--- a/src/dynarmic/ir/ir_emitter.h
+++ b/src/dynarmic/ir/ir_emitter.h
@@ -335,6 +335,7 @@ public:
     U32U64 FPMinNumeric(const U32U64& a, const U32U64& b);
     U32U64 FPMul(const U32U64& a, const U32U64& b);
     U16U32U64 FPMulAdd(const U16U32U64& addend, const U16U32U64& op1, const U16U32U64& op2);
+    U16U32U64 FPMulSub(const U16U32U64& minuend, const U16U32U64& op1, const U16U32U64& op2);
     U32U64 FPMulX(const U32U64& a, const U32U64& b);
     U16U32U64 FPNeg(const U16U32U64& a);
     U16U32U64 FPRecipEstimate(const U16U32U64& a);
diff --git a/src/dynarmic/ir/microinstruction.cpp b/src/dynarmic/ir/microinstruction.cpp
index fcebcda7..50af036d 100644
--- a/src/dynarmic/ir/microinstruction.cpp
+++ b/src/dynarmic/ir/microinstruction.cpp
@@ -308,6 +308,9 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
     case Opcode::FPMulAdd16:
     case Opcode::FPMulAdd32:
     case Opcode::FPMulAdd64:
+    case Opcode::FPMulSub16:
+    case Opcode::FPMulSub32:
+    case Opcode::FPMulSub64:
     case Opcode::FPRecipEstimate16:
     case Opcode::FPRecipEstimate32:
     case Opcode::FPRecipEstimate64:
diff --git a/src/dynarmic/ir/opcodes.inc b/src/dynarmic/ir/opcodes.inc
index 54d43ff4..dd060e0a 100644
--- a/src/dynarmic/ir/opcodes.inc
+++ b/src/dynarmic/ir/opcodes.inc
@@ -578,6 +578,9 @@ OPCODE(FPMul64,                                             U64,            U64,
 OPCODE(FPMulAdd16,                                          U16,            U16,            U16,            U16            )
 OPCODE(FPMulAdd32,                                          U32,            U32,            U32,            U32            )
 OPCODE(FPMulAdd64,                                          U64,            U64,            U64,            U64            )
+OPCODE(FPMulSub16,                                          U16,            U16,            U16,            U16            )
+OPCODE(FPMulSub32,                                          U32,            U32,            U32,            U32            )
+OPCODE(FPMulSub64,                                          U64,            U64,            U64,            U64            )
 OPCODE(FPMulX32,                                            U32,            U32,            U32            )
 OPCODE(FPMulX64,                                            U64,            U64,            U64            )
 OPCODE(FPNeg16,                                             U16,            U16            )
diff --git a/tests/test_generator.cpp b/tests/test_generator.cpp
index 90c2d635..cfdd8ea4 100644
--- a/tests/test_generator.cpp
+++ b/tests/test_generator.cpp
@@ -103,6 +103,7 @@ bool ShouldTestInst(IR::Block& block) {
         // Half-precision
         case IR::Opcode::FPAbs16:
         case IR::Opcode::FPMulAdd16:
+        case IR::Opcode::FPMulSub16:
         case IR::Opcode::FPNeg16:
         case IR::Opcode::FPRecipEstimate16:
         case IR::Opcode::FPRecipExponent16: