backend_x64: Fix bugs when FPCR.FZ=1

Bugs:
* DenormalsAreZero flushed denormals to positive zero instead of preserving their sign.
* FMAXNM/FMINNM (scalar) should perform DAZ *before* special zero handling.
* FMAX/FMIN/FMAXNM/FMINNM (vector) did not perform DAZ on their inputs.
This commit is contained in:
MerryMage 2018-07-31 15:32:14 +01:00
parent 2b538b471f
commit b393e15ab6
2 changed files with 49 additions and 8 deletions

View file

@ -92,7 +92,7 @@ void DenormalsAreZero(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64 gpr_
// SSE doesn't do this for us when SSE's DAZ is enabled. // SSE doesn't do this for us when SSE's DAZ is enabled.
code.ja(end); code.ja(end);
code.pxor(xmm_value, xmm_value); code.andps(xmm_value, code.MConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7)); code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7));
code.L(end); code.L(end);
} }
@ -267,13 +267,13 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unus
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr(); Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
preprocess(result, operand, gpr_scratch, end);
}
if (ctx.FPSCR_FTZ()) { if (ctx.FPSCR_FTZ()) {
DenormalsAreZero<fsize>(code, result, gpr_scratch); DenormalsAreZero<fsize>(code, result, gpr_scratch);
DenormalsAreZero<fsize>(code, operand, gpr_scratch); DenormalsAreZero<fsize>(code, operand, gpr_scratch);
} }
if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
preprocess(result, operand, gpr_scratch, end);
}
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) { if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
PreProcessNaNs<fsize>(code, result, operand, end); PreProcessNaNs<fsize>(code, result, operand, end);
} }

View file

@ -131,6 +131,15 @@ Xbyak::Address GetNaNVector(BlockOfCode& code) {
} }
} }
/// Returns the operand produced by code.MConst for an xword constant in which
/// every fsize-bit lane has only its sign bit set (the bit pattern of -0.0).
///
/// For fsize == 32 each 64-bit half holds two such lanes; for any other fsize
/// each 64-bit half is treated as a single 64-bit lane.
template<size_t fsize>
Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
    // Both halves of the constant are identical, so the pattern is computed once.
    constexpr auto sign_bit_lanes = fsize == 32 ? 0x8000'0000'8000'0000
                                                : 0x8000'0000'0000'0000;
    return code.MConst(xword, sign_bit_lanes, sign_bit_lanes);
}
template<size_t fsize> template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) { void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
if (ctx.FPSCR_DN()) { if (ctx.FPSCR_DN()) {
@ -146,6 +155,20 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
} }
} }
/// Emits a packed floating-point add of a zero vector onto every register in
/// to_daz. Nothing is emitted unless ctx.FPSCR_FTZ() is set.
///
/// @param code    Emitter that receives the generated instructions.
/// @param ctx     Supplies the FTZ flag and the rounding mode.
/// @param to_daz  Registers that are modified in place by the emitted add.
/// @param tmp     Scratch register; overwritten with the zero vector.
///
/// NOTE(review): presumably this relies on the host treating denormal inputs as
/// zero during the add (e.g. MXCSR.DAZ being enabled elsewhere) - confirm.
template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
    if (!ctx.FPSCR_FTZ()) {
        return;
    }

    // The addend must leave the sign of a zero operand untouched. Under IEEE 754
    // that is -0.0 in every rounding mode except round-towards-minus-infinity,
    // where (+0) + (-0) yields -0 and so +0.0 has to be used instead.
    const bool rounds_towards_minus_inf = ctx.FPSCR_RMode() == FP::RoundingMode::TowardsMinusInfinity;
    if (rounds_towards_minus_inf) {
        code.xorps(tmp, tmp);
    } else {
        code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
    }

    for (const Xbyak::Xmm& target : to_daz) {
        FCODE(addp)(target, tmp);
    }
}
template<typename T> template<typename T>
struct DefaultIndexer { struct DefaultIndexer {
std::tuple<T> operator()(size_t i, const VectorArray<T>& a) { std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
@ -565,12 +588,14 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
if (ctx.FPSCR_DN()) { if (ctx.FPSCR_DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b); FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vcmpunordp)(nan_mask, result, xmm_b); FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@ -602,10 +627,17 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
return; return;
} }
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){ EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
if (ctx.FPSCR_FTZ()) {
const Xbyak::Xmm prev_xmm_b = xmm_b;
xmm_b = ctx.reg_alloc.ScratchXmm();
code.movaps(xmm_b, prev_xmm_b);
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
}
// What we are doing here is handling the case when the inputs are differently signed zeros. // What we are doing here is handling the case when the inputs are differently signed zeros.
// x86-64 treats differently signed zeros as equal while ARM does not. // x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero. // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
@ -643,12 +675,14 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
if (ctx.FPSCR_DN()) { if (ctx.FPSCR_DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b); FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vcmpunordp)(nan_mask, result, xmm_b); FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@ -680,10 +714,17 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
return; return;
} }
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){ EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
if (ctx.FPSCR_FTZ()) {
const Xbyak::Xmm prev_xmm_b = xmm_b;
xmm_b = ctx.reg_alloc.ScratchXmm();
code.movaps(xmm_b, prev_xmm_b);
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
}
// What we are doing here is handling the case when the inputs are differently signed zeros. // What we are doing here is handling the case when the inputs are differently signed zeros.
// x86-64 treats differently signed zeros as equal while ARM does not. // x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero. // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.