From 822fd4a87561ce22997122b1e7fd88c961f0df8d Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Tue, 31 Jul 2018 16:07:46 +0100
Subject: [PATCH] backend_x64: Fix FPVectorMulAdd and FPMulAdd NaN handling
 with denormals

When FPCR.FZ is set, the NaN handlers must treat denormal inputs as zero.
---
 src/backend_x64/emit_x64_floating_point.cpp   | 11 ++++++-----
 .../emit_x64_vector_floating_point.cpp        | 11 ++++++-----
 src/common/fp/util.h                          |  6 +++++-
 tests/A64/a64.cpp                             | 19 +++++++++++++++++++
 4 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index 4f2a8bc0..4a37a676 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -158,7 +158,7 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label&
 }
 
 template<size_t fsize, typename NaNHandler>
-void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
+void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
     Xbyak::Label nan;
@@ -175,7 +175,8 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c,
     code.movq(code.ABI_PARAM1, a);
     code.movq(code.ABI_PARAM2, b);
     code.movq(code.ABI_PARAM3, c);
-    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT)>(nan_handler));
+    code.mov(code.ABI_PARAM4, ctx.FPCR());
+    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT, FP::FPCR)>(nan_handler));
     code.movq(a, code.ABI_RETURN);
     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
     code.add(rsp, 8);
@@ -317,7 +318,7 @@ void FPFourOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn,
         DenormalsAreZero<fsize>(code, operand3, gpr_scratch);
     }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
-        PreProcessNaNs<fsize>(code, result, operand2, operand3, end, nan_handler);
+        PreProcessNaNs<fsize>(code, ctx, result, operand2, operand3, end, nan_handler);
     }
     fn(result, operand2, operand3);
     if (ctx.FPSCR_FTZ()) {
@@ -656,8 +657,8 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
         FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
             FCODE(vfmadd231s)(result, operand2, operand3);
-        }, [](FPT a, FPT b, FPT c) -> FPT {
-            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
+        }, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
+            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
                 return FP::FPInfo<FPT>::DefaultNaN();
             }
             return *FP::ProcessNaNs(a, b, c);
diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index bbb181b6..91282a48 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -54,7 +54,7 @@ struct NaNHandler {
 public:
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
-    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&);
+    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR);
 
     static function_type GetDefault() {
         return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
@@ -63,7 +63,7 @@ public:
 private:
     template<size_t... argi>
     static function_type GetDefaultImpl(std::index_sequence<argi...>) {
-        const auto result = [](std::array<VectorArray<FPT>, narg>& values) {
+        const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) {
             VectorArray<FPT>& result = values[0];
             for (size_t elementi = 0; elementi < result.size(); ++elementi) {
                 const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
@@ -111,6 +111,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, narg
         code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
     }
     code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.mov(code.ABI_PARAM2, ctx.FPCR());
 
     code.CallFunction(nan_handler);
 
@@ -772,14 +773,14 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
         const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
         EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
-            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values)>(
-                [](std::array<VectorArray<FPT>, 4>& values) {
+            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
+                [](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
                     VectorArray<FPT>& result = values[0];
                     const VectorArray<FPT>& a = values[1];
                     const VectorArray<FPT>& b = values[2];
                     const VectorArray<FPT>& c = values[3];
                     for (size_t i = 0; i < result.size(); i++) {
-                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i])) || (FP::IsZero(b[i]) && FP::IsInf(c[i])))) {
+                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
                             result[i] = FP::FPInfo<FPT>::DefaultNaN();
                         } else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
                             result[i] = *r;
diff --git a/src/common/fp/util.h b/src/common/fp/util.h
index 1284bbe4..94ab2b50 100644
--- a/src/common/fp/util.h
+++ b/src/common/fp/util.h
@@ -9,13 +9,17 @@
 #include <boost/optional.hpp>
 
 #include "common/common_types.h"
+#include "common/fp/fpcr.h"
 #include "common/fp/info.h"
 
 namespace Dynarmic::FP {
 
 /// Is floating point value a zero?
 template<typename FPT>
-constexpr bool IsZero(FPT value) {
+inline bool IsZero(FPT value, FPCR fpcr) {
+    if (fpcr.FZ()) { // FZ set: a value whose exponent bits are all clear (zero or denormal) is reported as zero
+        return (value & FPInfo<FPT>::exponent_mask) == 0;
+    }
     return (value & ~FPInfo<FPT>::sign_mask) == 0;
 }
 
diff --git a/tests/A64/a64.cpp b/tests/A64/a64.cpp
index 9a13e4ca..4fa3a8ec 100644
--- a/tests/A64/a64.cpp
+++ b/tests/A64/a64.cpp
@@ -390,3 +390,22 @@ TEST_CASE("A64: FMADD", "[a64]") {
 
     REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
 }
+
+TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
+    TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem[0] = 0x4e2fcccc; // FMLA.4S V12, V6, V15
+    env.code_mem[1] = 0x14000000; // B .
+
+    jit.SetPC(0);
+    jit.SetVector(12, {0x3c9623b17ff80000, 0xbff0000080000076}); // accumulator and destination; lane 0 = 0x7ff80000 (quiet NaN)
+    jit.SetVector(6, {0x7ff80000ff800000, 0x09503366c1200000}); // lane 0 = 0xff800000 (-infinity)
+    jit.SetVector(15, {0x3ff0000080636d24, 0xbf800000e73a5134}); // lane 0 = 0x80636d24 (negative denormal)
+    jit.SetFpcr(0x01000000); // bit 24 = FPCR.FZ (flush-to-zero) per the Arm ARM
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581}); // lane 0: QNaN + (-inf * flushed denormal) must yield the default NaN 0x7fc00000
+}