backend_x64: Fix FPVectorMulAdd and FPMulAdd NaN handling with denormals

Denormals should be treated as zero in NaN handler
MerryMage 2018-07-31 16:07:46 +01:00
parent 381821eda3
commit 822fd4a875
4 changed files with 36 additions and 11 deletions

File 1 of 4 (the scalar floating-point emitter):

@@ -158,7 +158,7 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label&
 }
 
 template<size_t fsize, typename NaNHandler>
-void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
+void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
     Xbyak::Label nan;
@@ -175,7 +175,8 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c,
     code.movq(code.ABI_PARAM1, a);
     code.movq(code.ABI_PARAM2, b);
     code.movq(code.ABI_PARAM3, c);
-    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT)>(nan_handler));
+    code.mov(code.ABI_PARAM4, ctx.FPCR());
+    code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT, FP::FPCR)>(nan_handler));
     code.movq(a, code.ABI_RETURN);
     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
     code.add(rsp, 8);
@@ -317,7 +318,7 @@ void FPFourOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn,
         DenormalsAreZero<fsize>(code, operand3, gpr_scratch);
     }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
-        PreProcessNaNs<fsize>(code, result, operand2, operand3, end, nan_handler);
+        PreProcessNaNs<fsize>(code, ctx, result, operand2, operand3, end, nan_handler);
     }
     fn(result, operand2, operand3);
     if (ctx.FPSCR_FTZ()) {
@@ -656,8 +657,8 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
         FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
             FCODE(vfmadd231s)(result, operand2, operand3);
-        }, [](FPT a, FPT b, FPT c) -> FPT {
-            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
+        }, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
+            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
                 return FP::FPInfo<FPT>::DefaultNaN();
             }
             return *FP::ProcessNaNs(a, b, c);

File 2 of 4 (the vector floating-point emitter):

@@ -54,7 +54,7 @@ struct NaNHandler {
 public:
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
-    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&);
+    using function_type = void(*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR);
 
     static function_type GetDefault() {
         return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
@@ -63,7 +63,7 @@ public:
 private:
     template<size_t... argi>
     static function_type GetDefaultImpl(std::index_sequence<argi...>) {
-        const auto result = [](std::array<VectorArray<FPT>, narg>& values) {
+        const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) {
            VectorArray<FPT>& result = values[0];
 
            for (size_t elementi = 0; elementi < result.size(); ++elementi) {
                const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
@@ -111,6 +111,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, narg
        code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
     }
 
     code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.mov(code.ABI_PARAM2, ctx.FPCR());
     code.CallFunction(nan_handler);
@@ -772,14 +773,14 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
        const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
        EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
-            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values)>(
-                [](std::array<VectorArray<FPT>, 4>& values) {
+            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
+                [](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
                    VectorArray<FPT>& result = values[0];
                    const VectorArray<FPT>& a = values[1];
                    const VectorArray<FPT>& b = values[2];
                    const VectorArray<FPT>& c = values[3];
                    for (size_t i = 0; i < result.size(); i++) {
-                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i])) || (FP::IsZero(b[i]) && FP::IsInf(c[i])))) {
+                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
                        } else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
                            result[i] = *r;

File 3 of 4 (the common FP utility header):

@@ -9,13 +9,17 @@
 #include <boost/optional.hpp>
 
 #include "common/common_types.h"
+#include "common/fp/fpcr.h"
 #include "common/fp/info.h"
 
 namespace Dynarmic::FP {
 
 /// Is floating point value a zero?
 template<typename FPT>
-constexpr bool IsZero(FPT value) {
+inline bool IsZero(FPT value, FPCR fpcr) {
+    if (fpcr.FZ()) {
+        return (value & FPInfo<FPT>::exponent_mask) == 0;
+    }
     return (value & ~FPInfo<FPT>::sign_mask) == 0;
 }
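
For reference, a minimal standalone sketch (not part of the diff; the constants and main() harness are mine, single precision only) of the rule the commit message describes: with FPCR.FZ set, a denormal operand must compare equal to zero when the NaN handler decides whether "QNaN accumulator plus infinity times zero" has to produce the default NaN.

// Standalone illustration of the FZ-aware zero test (assumed helper, not Dynarmic's API).
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t sign_mask     = 0x80000000;
constexpr std::uint32_t exponent_mask = 0x7f800000;

// With flush-to-zero (FZ) enabled, a denormal (exponent field == 0) counts as zero.
bool IsZeroSketch(std::uint32_t value, bool fz) {
    if (fz) {
        return (value & exponent_mask) == 0;
    }
    return (value & ~sign_mask) == 0;  // only exact +0 / -0
}

int main() {
    const std::uint32_t denormal = 0x00000076;  // a tiny single-precision denormal
    std::printf("FZ=0: %d  FZ=1: %d\n", IsZeroSketch(denormal, false), IsZeroSketch(denormal, true));
    // Prints "FZ=0: 0  FZ=1: 1". Under FZ, Inf * denormal behaves as Inf * 0, so an
    // FMA with a QNaN addend must return the default NaN rather than the input QNaN.
}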

File 4 of 4 (the A64 tests):

@@ -390,3 +390,22 @@ TEST_CASE("A64: FMADD", "[a64]") {
     REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
 }
 
+TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
+    TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem[0] = 0x4e2fcccc; // FMLA.4S V12, V6, V15
+    env.code_mem[1] = 0x14000000; // B .
+
+    jit.SetPC(0);
+    jit.SetVector(12, {0x3c9623b17ff80000, 0xbff0000080000076});
+    jit.SetVector(6, {0x7ff80000ff800000, 0x09503366c1200000});
+    jit.SetVector(15, {0x3ff0000080636d24, 0xbf800000e73a5134});
+    jit.SetFpcr(0x01000000);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581});
+}
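
A brief note on how this test exercises the fix (my own reading of the vectors above, not part of the commit): FPCR = 0x01000000 sets the FZ bit, and element 0 pairs a QNaN accumulator (0x7ff80000) with negative infinity (0xff800000) multiplied by a denormal (0x80636d24). A small sketch of that element-0 check, under those assumptions:

// Element-0 breakdown of the FMLA.4S test; bit patterns taken from the vectors above.
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint32_t addend = 0x7ff80000;  // low single of V12: a QNaN
    const std::uint32_t op1    = 0xff800000;  // low single of V6: -infinity
    const std::uint32_t op2    = 0x80636d24;  // low single of V15: a denormal
    const bool fz = true;                     // FPCR = 0x01000000 -> FZ set

    const bool addend_is_qnan       = (addend & 0x7fc00000) == 0x7fc00000;
    const bool op1_is_inf           = (op1 & 0x7fffffff) == 0x7f800000;
    const bool op2_is_zero_under_fz = fz && (op2 & 0x7f800000) == 0;

    // QNaN accumulator plus infinity * (denormal flushed to zero): default NaN.
    if (addend_is_qnan && op1_is_inf && op2_is_zero_under_fz) {
        std::printf("element 0 = 0x%08x\n", 0x7fc00000u);
        // 0x7fc00000 matches the low word of the expected Vector{0x7ff800007fc00000, ...}.
    }
}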