backend_x64: Fix FPVectorMulAdd and FPMulAdd NaN handling with denormals
Denormals should be treated as zero in the NaN handlers when FPCR.FZ is set
This commit is contained in:
parent
381821eda3
commit
822fd4a875
4 changed files with 36 additions and 11 deletions
|
@ -158,7 +158,7 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label&
|
||||||
}
|
}
|
||||||
|
|
||||||
template<size_t fsize, typename NaNHandler>
|
template<size_t fsize, typename NaNHandler>
|
||||||
void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
|
void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
|
||||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||||
|
|
||||||
Xbyak::Label nan;
|
Xbyak::Label nan;
|
||||||
|
@ -175,7 +175,8 @@ void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c,
|
||||||
code.movq(code.ABI_PARAM1, a);
|
code.movq(code.ABI_PARAM1, a);
|
||||||
code.movq(code.ABI_PARAM2, b);
|
code.movq(code.ABI_PARAM2, b);
|
||||||
code.movq(code.ABI_PARAM3, c);
|
code.movq(code.ABI_PARAM3, c);
|
||||||
code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT)>(nan_handler));
|
code.mov(code.ABI_PARAM4, ctx.FPCR());
|
||||||
|
code.CallFunction(static_cast<FPT(*)(FPT, FPT, FPT, FP::FPCR)>(nan_handler));
|
||||||
code.movq(a, code.ABI_RETURN);
|
code.movq(a, code.ABI_RETURN);
|
||||||
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
code.add(rsp, 8);
|
code.add(rsp, 8);
|
||||||
|
@ -317,7 +318,7 @@ void FPFourOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn,
|
||||||
DenormalsAreZero<fsize>(code, operand3, gpr_scratch);
|
DenormalsAreZero<fsize>(code, operand3, gpr_scratch);
|
||||||
}
|
}
|
||||||
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
PreProcessNaNs<fsize>(code, result, operand2, operand3, end, nan_handler);
|
PreProcessNaNs<fsize>(code, ctx, result, operand2, operand3, end, nan_handler);
|
||||||
}
|
}
|
||||||
fn(result, operand2, operand3);
|
fn(result, operand2, operand3);
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
@ -656,8 +657,8 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
FCODE(vfmadd231s)(result, operand2, operand3);
|
FCODE(vfmadd231s)(result, operand2, operand3);
|
||||||
}, [](FPT a, FPT b, FPT c) -> FPT {
|
}, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
|
||||||
if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
|
if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
|
||||||
return FP::FPInfo<FPT>::DefaultNaN();
|
return FP::FPInfo<FPT>::DefaultNaN();
|
||||||
}
|
}
|
||||||
return *FP::ProcessNaNs(a, b, c);
|
return *FP::ProcessNaNs(a, b, c);
|
||||||
|
|
|
@ -54,7 +54,7 @@ struct NaNHandler {
|
||||||
public:
|
public:
|
||||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||||
|
|
||||||
using function_type = void(*)(std::array<VectorArray<FPT>, narg>&);
|
using function_type = void(*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR);
|
||||||
|
|
||||||
static function_type GetDefault() {
|
static function_type GetDefault() {
|
||||||
return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
|
return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
|
||||||
|
@ -63,7 +63,7 @@ public:
|
||||||
private:
|
private:
|
||||||
template<size_t... argi>
|
template<size_t... argi>
|
||||||
static function_type GetDefaultImpl(std::index_sequence<argi...>) {
|
static function_type GetDefaultImpl(std::index_sequence<argi...>) {
|
||||||
const auto result = [](std::array<VectorArray<FPT>, narg>& values) {
|
const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) {
|
||||||
VectorArray<FPT>& result = values[0];
|
VectorArray<FPT>& result = values[0];
|
||||||
for (size_t elementi = 0; elementi < result.size(); ++elementi) {
|
for (size_t elementi = 0; elementi < result.size(); ++elementi) {
|
||||||
const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
|
const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
|
||||||
|
@ -111,6 +111,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, narg
|
||||||
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
|
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
|
||||||
}
|
}
|
||||||
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||||
|
code.mov(code.ABI_PARAM2, ctx.FPCR());
|
||||||
|
|
||||||
code.CallFunction(nan_handler);
|
code.CallFunction(nan_handler);
|
||||||
|
|
||||||
|
@ -772,14 +773,14 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
|
const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
|
||||||
EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
|
EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
|
||||||
static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values)>(
|
static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
|
||||||
[](std::array<VectorArray<FPT>, 4>& values) {
|
[](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
|
||||||
VectorArray<FPT>& result = values[0];
|
VectorArray<FPT>& result = values[0];
|
||||||
const VectorArray<FPT>& a = values[1];
|
const VectorArray<FPT>& a = values[1];
|
||||||
const VectorArray<FPT>& b = values[2];
|
const VectorArray<FPT>& b = values[2];
|
||||||
const VectorArray<FPT>& c = values[3];
|
const VectorArray<FPT>& c = values[3];
|
||||||
for (size_t i = 0; i < result.size(); i++) {
|
for (size_t i = 0; i < result.size(); i++) {
|
||||||
if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i])) || (FP::IsZero(b[i]) && FP::IsInf(c[i])))) {
|
if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
|
||||||
result[i] = FP::FPInfo<FPT>::DefaultNaN();
|
result[i] = FP::FPInfo<FPT>::DefaultNaN();
|
||||||
} else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
|
} else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
|
||||||
result[i] = *r;
|
result[i] = *r;
|
||||||
|
|
|
@ -9,13 +9,17 @@
|
||||||
#include <boost/optional.hpp>
|
#include <boost/optional.hpp>
|
||||||
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
#include "common/fp/fpcr.h"
|
||||||
#include "common/fp/info.h"
|
#include "common/fp/info.h"
|
||||||
|
|
||||||
namespace Dynarmic::FP {
|
namespace Dynarmic::FP {
|
||||||
|
|
||||||
/// Is floating point value a zero?
|
/// Is floating point value a zero? When FPCR.FZ is set, denormals (zero exponent field) also count as zero.
|
||||||
template<typename FPT>
|
template<typename FPT>
|
||||||
constexpr bool IsZero(FPT value) {
|
inline bool IsZero(FPT value, FPCR fpcr) {
|
||||||
|
if (fpcr.FZ()) {
|
||||||
|
return (value & FPInfo<FPT>::exponent_mask) == 0;
|
||||||
|
}
|
||||||
return (value & ~FPInfo<FPT>::sign_mask) == 0;
|
return (value & ~FPInfo<FPT>::sign_mask) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -390,3 +390,22 @@ TEST_CASE("A64: FMADD", "[a64]") {
|
||||||
|
|
||||||
REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
|
REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
|
||||||
|
TestEnv env;
|
||||||
|
Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
|
||||||
|
|
||||||
|
env.code_mem[0] = 0x4e2fcccc; // FMLA.4S V12, V6, V15
|
||||||
|
env.code_mem[1] = 0x14000000; // B .
|
||||||
|
|
||||||
|
jit.SetPC(0);
|
||||||
|
jit.SetVector(12, {0x3c9623b17ff80000, 0xbff0000080000076});
|
||||||
|
jit.SetVector(6, {0x7ff80000ff800000, 0x09503366c1200000});
|
||||||
|
jit.SetVector(15, {0x3ff0000080636d24, 0xbf800000e73a5134});
|
||||||
|
jit.SetFpcr(0x01000000);
|
||||||
|
|
||||||
|
env.ticks_left = 2;
|
||||||
|
jit.Run();
|
||||||
|
|
||||||
|
REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581});
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue