From bb93353f94510e3913b834a88572b661b3024b70 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Tue, 31 Jul 2018 18:12:39 +0100
Subject: [PATCH] emit_x64_vector_floating_point: Correct FMA in FTZ mode

x64 rounds before flushing to zero
AArch64 rounds after flushing to zero

This difference of behaviour is noticeable if something would round to
the smallest normalized number.
---
 src/backend_x64/emit_x64_floating_point.cpp   |  56 ++++++-
 .../emit_x64_vector_floating_point.cpp        | 147 ++++++++----------
 tests/A64/a64.cpp                             |  19 +++
 3 files changed, 130 insertions(+), 92 deletions(-)

diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index d9a45f87..82531ced 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -39,10 +39,12 @@ namespace {
 constexpr u64 f32_negative_zero = 0x80000000u;
 constexpr u64 f32_nan = 0x7fc00000u;
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
+constexpr u64 f32_smallest_normal = 0x00800000u;
 
 constexpr u64 f64_negative_zero = 0x8000000000000000u;
 constexpr u64 f64_nan = 0x7ff8000000000000u;
 constexpr u64 f64_non_sign_mask = 0x7fffffffffffffffu;
+constexpr u64 f64_smallest_normal = 0x0010000000000000u;
 constexpr u64 f64_penultimate_positive_denormal = 0x000ffffffffffffeu;
 
 constexpr u64 f64_min_s32 = 0xc1e0000000000000u; // -2147483648 as a double
@@ -590,14 +592,52 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
-        FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
-            FCODE(vfmadd231s)(result, operand2, operand3);
-        }, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
-            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
-                return FP::FPInfo<FPT>::DefaultNaN();
-            }
-            return *FP::ProcessNaNs(a, b, c);
-        });
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        Xbyak::Label end, fallback;
+
+        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code.movaps(result, operand1);
+        FCODE(vfmadd231s)(result, operand2, operand3);
+
+        code.movaps(tmp, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
+        code.andps(tmp, result);
+        FCODE(ucomis)(tmp, code.MConst(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
+        code.jz(fallback, code.T_NEAR);
+        code.L(end);
+
+        code.SwitchToFarCode();
+        code.L(fallback);
+
+        code.sub(rsp, 8);
+        ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.movq(code.ABI_PARAM1, operand1);
+        code.movq(code.ABI_PARAM2, operand2);
+        code.movq(code.ABI_PARAM3, operand3);
+        code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR());
+#ifdef _WIN32
+        code.sub(rsp, 16 + ABI_SHADOW_SPACE);
+        code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+        code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
+        code.CallFunction(&FP::FPMulAdd<FPT>);
+        code.add(rsp, 16 + ABI_SHADOW_SPACE);
+#else
+        code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+        code.CallFunction(&FP::FPMulAdd<FPT>);
+#endif
+        code.movq(result, code.ABI_RETURN);
+        ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.add(rsp, 8);
+
+        code.jmp(end, code.T_NEAR);
+        code.SwitchToNearCode();
+
+        ctx.reg_alloc.DefineValue(inst, result);
         return;
     }
 
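The scalar hunk above gates the slow path with an andps/ucomis pair: only a fused result whose magnitude is exactly the smallest normal could have been rounded up from a value that AArch64, which flushes before rounding, would have turned into zero. A minimal standalone sketch of that predicate for the f32 case follows; it is not part of the patch and needs_fma_fallback is a hypothetical name. Note that the emitted ucomis also sends NaN results to the fallback (an unordered compare sets ZF), which the plain integer compare below does not model.

    #include <cstdint>
    #include <cstring>

    // Hypothetical helper mirroring the emitted check: clear the sign bit,
    // then compare against the smallest normalized f32 (0x00800000).
    bool needs_fma_fallback(float fused_result) {
        constexpr std::uint32_t f32_non_sign_mask   = 0x7fffffffu;
        constexpr std::uint32_t f32_smallest_normal = 0x00800000u;
        std::uint32_t bits;
        std::memcpy(&bits, &fused_result, sizeof(bits));
        return (bits & f32_non_sign_mask) == f32_smallest_normal;
    }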
diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 91282a48..cb21a35e 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -141,6 +141,15 @@ Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetSmallestNormalVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x0080'0000'0080'0000, 0x0080'0000'0080'0000);
+    } else {
+        return code.MConst(xword, 0x0010'0000'0000'0000, 0x0010'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -310,52 +319,6 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
-template<size_t fsize, template<typename> class Indexer, typename Function>
-void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 4>::function_type nan_handler = NaNHandler<fsize, Indexer, 4>::GetDefault()) {
-    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
-
-    if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
-
-        if constexpr (std::is_member_function_pointer_v<Function>) {
-            (code.*fn)(xmm_a, xmm_b, xmm_c);
-        } else {
-            fn(xmm_a, xmm_b, xmm_c);
-        }
-
-        ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
-
-        ctx.reg_alloc.DefineValue(inst, xmm_a);
-        return;
-    }
-
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
-    const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-
-    code.movaps(nan_mask, xmm_b);
-    code.movaps(result, xmm_a);
-    FCODE(cmpunordp)(nan_mask, xmm_a);
-    FCODE(cmpunordp)(nan_mask, xmm_c);
-    if constexpr (std::is_member_function_pointer_v<Function>) {
-        (code.*fn)(result, xmm_b, xmm_c);
-    } else {
-        fn(result, xmm_b, xmm_c);
-    }
-    FCODE(cmpunordp)(nan_mask, result);
-
-    HandleNaNs<fsize, 3>(code, ctx, {result, xmm_a, xmm_b, xmm_c}, nan_mask, nan_handler);
-
-    ctx.reg_alloc.DefineValue(inst, result);
-}
-
 template<typename Lambda>
 void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
@@ -426,16 +389,9 @@ void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, La
 }
 
 template<typename Lambda>
-void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Xbyak::Xmm arg3, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
 
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
-    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
-    ctx.reg_alloc.EndOfAllocScope();
-    ctx.reg_alloc.HostCall(nullptr);
-
 #ifdef _WIN32
     constexpr u32 stack_space = 5 * 16;
     code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
@@ -463,12 +419,24 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
     code.CallFunction(fn);
 
 #ifdef _WIN32
-    code.movaps(xmm0, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
 #else
-    code.movaps(xmm0, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
 #endif
 
     code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+}
+
+template<typename Lambda>
+void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
+    ctx.reg_alloc.EndOfAllocScope();
+    ctx.reg_alloc.HostCall(nullptr);
+
+    EmitFourOpFallbackWithoutRegAlloc(code, ctx, xmm0, arg1, arg2, arg3, lambda);
 
     ctx.reg_alloc.DefineValue(inst, xmm0);
 }
@@ -770,37 +738,48 @@ template<size_t fsize>
 void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
-    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
-        const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
-        EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
-            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
-                [](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
-                    VectorArray<FPT>& result = values[0];
-                    const VectorArray<FPT>& a = values[1];
-                    const VectorArray<FPT>& b = values[2];
-                    const VectorArray<FPT>& c = values[3];
-                    for (size_t i = 0; i < result.size(); i++) {
-                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
-                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
-                        } else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
-                            result[i] = *r;
-                        } else if (FP::IsNaN(result[i])) {
-                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
-                        }
-                    }
-                }
-            )
-        );
+    const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+        for (size_t i = 0; i < result.size(); i++) {
+            result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
+        }
+    };
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        Xbyak::Label end, fallback;
+
+        code.movaps(result, xmm_a);
+        FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+
+        code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        code.andnps(tmp, result);
+        FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+        code.vptest(tmp, tmp);
+        code.jnz(fallback, code.T_NEAR);
+        code.L(end);
+
+        code.SwitchToFarCode();
+        code.L(fallback);
+        code.sub(rsp, 8);
+        ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn);
+        ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.add(rsp, 8);
+        code.jmp(end, code.T_NEAR);
+        code.SwitchToNearCode();
+
+        ctx.reg_alloc.DefineValue(inst, result);
         return;
     }
 
-    EmitFourOpFallback(code, ctx, inst,
-        [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
-            for (size_t i = 0; i < result.size(); i++) {
-                result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
-            }
-        }
-    );
+    EmitFourOpFallback(code, ctx, inst, fallback_fn);
 }
 
 void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) {
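The vector hunks above apply the same idea per lane: andnps with the negative-zero vector clears the sign bits, vcmpeq_uqp compares every lane against the smallest-normal vector, and vptest branches to the far-code fallback if any lane matched (the unordered-or-equal predicate also routes NaN lanes there). A rough lane-wise sketch for the 32-bit case, not part of the patch and with a hypothetical name; NaN lanes are not modelled:

    #include <array>
    #include <cstdint>

    // Hypothetical helper: true if any 32-bit lane of the fused result has a
    // magnitude equal to the smallest normalized f32, i.e. the only results
    // where round-then-flush and flush-then-round can disagree.
    bool any_lane_needs_fallback(const std::array<std::uint32_t, 4>& result_bits) {
        constexpr std::uint32_t sign_bit            = 0x80000000u;
        constexpr std::uint32_t f32_smallest_normal = 0x00800000u;
        for (const std::uint32_t lane : result_bits) {
            if ((lane & ~sign_bit) == f32_smallest_normal) {
                return true;
            }
        }
        return false;
    }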
diff --git a/tests/A64/a64.cpp b/tests/A64/a64.cpp
index 4fa3a8ec..8e505f99 100644
--- a/tests/A64/a64.cpp
+++ b/tests/A64/a64.cpp
@@ -409,3 +409,22 @@ TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
 
     REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581});
 }
+
+TEST_CASE("A64: FMLA.4S (0x80800000)", "[a64]") {
+    TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem[0] = 0x4e38cc2b; // FMLA.4S V11, V1, V24
+    env.code_mem[1] = 0x14000000; // B .
+
+    jit.SetPC(0);
+    jit.SetVector(11, {0xc79b271efff05678, 0xffc0000080800000});
+    jit.SetVector(1, {0x00636d2400800000, 0x0966320bb26bddee});
+    jit.SetVector(24, {0x460e8c84fff00000, 0x8ba98d2780800002});
+    jit.SetFpcr(0x03000000);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(11) == Vector{0xc79b271e7fc00000, 0x7fc0000080000000});
+}
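The new test targets exactly the boundary the commit message describes. As a self-contained illustration (a sketch, not part of the patch) of why the two flush-to-zero orderings disagree, take an exact sum whose magnitude is just below the smallest binary32 normal but within half an ulp of it:

    #include <cmath>
    #include <cstdio>

    int main() {
        const double smallest_normal = std::ldexp(1.0, -126);
        // Exact value: subnormal in magnitude, but within half an ulp of 2^-126.
        const double exact = smallest_normal * (1.0 - 0x1.0p-30);

        // x64-style FTZ: round first; the rounded value 2^-126 is normal,
        // so nothing gets flushed.
        const float round_then_flush = static_cast<float>(exact);

        // AArch64-style FZ: the subnormal intermediate is flushed to zero
        // before rounding ever happens.
        const float flush_then_round = std::abs(exact) < smallest_normal ? 0.0f : static_cast<float>(exact);

        std::printf("%a vs %a\n", round_then_flush, flush_then_round); // typically 0x1p-126 vs 0x0p+0
    }

The raw x64 FMA therefore leaves 0x00800000 (or 0x80800000 with the sign bit set) in such a lane, which is the bit pattern the emitted checks look for before recomputing the lane with FP::FPMulAdd.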