backend_x64: Fix FPVectorMulAdd and FPMulAdd NaN handling with denormals

When flush-to-zero (FPCR.FZ) is enabled, denormals should be treated as zero in the NaN handler.
2018-07-31 16:07:46 +01:00 · 2018-07-31 16:07:46 +01:00 · 822fd4a875
commit 822fd4a875
parent 381821eda3
4 changed files with 36 additions and 11 deletions
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@ -158,7 +158,7 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label&
 }
 template<size_t fsize, typename NaNHandler>
-void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
+void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
    using FPT = mp::unsigned_integer_of_size<fsize>;
    Xbyak::Label nan;
@ -175,7 +175,8 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c,
    code.movq(code.ABI_PARAM1, a);
    code.movq(code.ABI_PARAM2, b);
    code.movq(code.ABI_PARAM3, c);
-    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT)>(nan_handler));
+    code.mov(code.ABI_PARAM4, ctx.FPCR());
+    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT, FP::FPCR)>(nan_handler));
    code.movq(a, code.ABI_RETURN);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
    code.add(rsp, 8);
@ -317,7 +318,7 @@ void FPFourOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn,
        DenormalsAreZero<fsize>(code, operand3, gpr_scratch);
    }
    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
-        PreProcessNaNs<fsize>(code, result, operand2, operand3, end, nan_handler);
+        PreProcessNaNs<fsize>(code, ctx, result, operand2, operand3, end, nan_handler);
    }
    fn(result, operand2, operand3);
    if (ctx.FPSCR_FTZ()) {
@ -656,8 +657,8 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
        FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
            FCODE(vfmadd231s)(result, operand2, operand3);
-        }, [](FPT a, FPT b, FPT c) -> FPT {
+        }, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
-            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
+            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
                return FP::FPInfo<FPT>::DefaultNaN();
            }
            return *FP::ProcessNaNs(a, b, c);
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@ -54,7 +54,7 @@ struct NaNHandler {
 public:
    using FPT = mp::unsigned_integer_of_size<fsize>;
-    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&);
+    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR);
    static function_type GetDefault() {
        return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
@ -63,7 +63,7 @@ public:
 private:
    template<size_t... argi>
    static function_type GetDefaultImpl(std::index_sequence<argi...>) {
-        const auto result = [](std::array<VectorArray<FPT>, narg>& values) {
+        const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) {
            VectorArray<FPT>& result = values[0];
            for (size_t elementi = 0; elementi < result.size(); ++elementi) {
                const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
@ -111,6 +111,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, narg
        code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
    }
    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.mov(code.ABI_PARAM2, ctx.FPCR());
    code.CallFunction(nan_handler);
@ -772,14 +773,14 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
        const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
        EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
-            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values)>(
+            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
-                [](std::array<VectorArray<FPT>, 4>& values) {
+                [](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
                    VectorArray<FPT>& result = values[0];
                    const VectorArray<FPT>& a = values[1];
                    const VectorArray<FPT>& b = values[2];
                    const VectorArray<FPT>& c = values[3];
                    for (size_t i = 0; i < result.size(); i++) {
-                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i])) || (FP::IsZero(b[i]) && FP::IsInf(c[i])))) {
+                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
                        } else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
                            result[i] = *r;
--- a/src/common/fp/util.h
+++ b/src/common/fp/util.h
@ -9,13 +9,17 @@
 #include <boost/optional.hpp>
 #include "common/common_types.h"
+#include "common/fp/fpcr.h"
 #include "common/fp/info.h"
 namespace Dynarmic::FP {
 /// Is floating point value a zero?
 template<typename FPT>
-constexpr bool IsZero(FPT value) {
+inline bool IsZero(FPT value, FPCR fpcr) {
+    if (fpcr.FZ()) {
+        return (value & FPInfo<FPT>::exponent_mask) == 0;
+    }
    return (value & ~FPInfo<FPT>::sign_mask) == 0;
 }
--- a/tests/A64/a64.cpp
+++ b/tests/A64/a64.cpp
@ -390,3 +390,22 @@ TEST_CASE("A64: FMADD", "[a64]") {
    REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
 }
+TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
+    TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+    env.code_mem[0] = 0x4e2fcccc; // FMLA.4S V12, V6, V15
+    env.code_mem[1] = 0x14000000; // B .
+    jit.SetPC(0);
+    jit.SetVector(12, {0x3c9623b17ff80000, 0xbff0000080000076});
+    jit.SetVector(6, {0x7ff80000ff800000, 0x09503366c1200000});
+    jit.SetVector(15, {0x3ff0000080636d24, 0xbf800000e73a5134});
+    jit.SetFpcr(0x01000000);
+    env.ticks_left = 2;
+    jit.Run();
+    REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581});
+}