backend_x64: Fix bugs when FPCR.FZ=1
Bugs:
* DenormalsAreZero flushed denormals to positive zero instead of preserving their sign.
* FMAXNM/FMINNM (scalar) should perform DAZ *before* their special zero handling.
* FMAX/FMIN/FMAXNM/FMINNM (vector) did not perform DAZ at all.
parent 2b538b471f
commit b393e15ab6

2 changed files with 49 additions and 8 deletions
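The first bug is in the scalar DenormalsAreZero helper: with FPCR.FZ=1 a denormal input must collapse to a zero that keeps the input's sign, not unconditionally to +0.0, which is what replacing the pxor with an andps against a negative-zero constant in the first hunk achieves. A minimal host-side sketch of that reference behaviour (the function name and test values below are illustrative only, not part of the commit):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Sign-preserving flush-to-zero for a single f32: keep only the sign bit,
// mirroring the `andps xmm_value, [f32_negative_zero]` emitted by the fix.
static float FlushDenormalToSignedZero(float value) {
    if (std::fpclassify(value) == FP_SUBNORMAL) {
        std::uint32_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        bits &= UINT32_C(0x80000000);  // +denormal -> +0.0f, -denormal -> -0.0f
        std::memcpy(&value, &bits, sizeof(bits));
    }
    return value;
}

int main() {
    const float pos = 1e-40f;   // subnormal in single precision
    const float neg = -1e-40f;
    std::printf("%d %d\n", std::signbit(FlushDenormalToSignedZero(pos)),
                           std::signbit(FlushDenormalToSignedZero(neg)));  // prints: 0 1
}
```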
@@ -92,7 +92,7 @@ void DenormalsAreZero(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64 gpr_
     // SSE doesn't do this for us when SSE's DAZ is enabled.
 
     code.ja(end);
-    code.pxor(xmm_value, xmm_value);
+    code.andps(xmm_value, code.MConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
     code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7));
     code.L(end);
 }
@@ -267,13 +267,13 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unus
     Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
     Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
 
-    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
-        preprocess(result, operand, gpr_scratch, end);
-    }
     if (ctx.FPSCR_FTZ()) {
         DenormalsAreZero<fsize>(code, result, gpr_scratch);
         DenormalsAreZero<fsize>(code, operand, gpr_scratch);
     }
+    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
+        preprocess(result, operand, gpr_scratch, end);
+    }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
         PreProcessNaNs<fsize>(code, result, operand, end);
     }
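The reorder above follows the architectural ordering: when FZ is set, input denormals are flushed before the operation's special-case handling ever sees them. A reference-style sketch of that ordering for a scalar max (illustrative only; the function names are made up and NaN handling is elided):

```cpp
#include <cmath>
#include <cstdio>

// Illustrative only: flush denormal inputs first, then apply the special
// zero rule (max(+0, -0) == +0), then the ordinary comparison.
static double FlushIfFZ(double x, bool fz) {
    return (fz && std::fpclassify(x) == FP_SUBNORMAL) ? std::copysign(0.0, x) : x;
}

static double FPMaxSketch(double a, double b, bool fz) {
    a = FlushIfFZ(a, fz);                  // step 1: DAZ happens before anything else
    b = FlushIfFZ(b, fz);
    if (a == 0.0 && b == 0.0) {            // step 2: special zero handling
        return std::signbit(a) ? b : a;    // +0 is the maximum of (+0, -0)
    }
    return a > b ? a : b;                  // step 3: ordinary max (NaNs elided)
}

int main() {
    // A positive denormal versus -0.0: after flushing, the zero rule applies.
    std::printf("%d\n", std::signbit(FPMaxSketch(1e-310, -0.0, true)));  // prints: 0
}
```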
@@ -131,6 +131,15 @@ Xbyak::Address GetNaNVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000);
+    } else {
+        return code.MConst(xword, 0x8000'0000'0000'0000, 0x8000'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -146,6 +155,20 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     }
 }
 
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
+    if (ctx.FPSCR_FTZ()) {
+        if (ctx.FPSCR_RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        } else {
+            code.xorps(tmp, tmp);
+        }
+        for (const Xbyak::Xmm& xmm : to_daz) {
+            FCODE(addp)(xmm, tmp);
+        }
+    }
+}
+
 template<typename T>
 struct DefaultIndexer {
     std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
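The new vector helper flushes by adding a zero vector to each register: adding a signed zero is an identity for every normal value and for zeros, and (assuming the host MXCSR DAZ bit is in effect when FPSCR_FTZ() is set, which the addps-based approach appears to rely on) a denormal lane is treated as a zero of its own sign, so it comes out as a correctly signed zero. The addend itself must not disturb the sign of zero lanes, which is why it is -0.0 in every rounding mode except round-towards-minus-infinity, where +0.0 is the identity instead (hence the xorps fallback). A small standalone demonstration of that rounding-mode detail (illustrative only, not from the commit; requires strict FP semantics, i.e. no -ffast-math):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
    // volatile keeps the sums from being folded at compile time, so they
    // observe the dynamic rounding mode set below.
    volatile double pz = +0.0, nz = -0.0;

    // Round-to-nearest: adding -0.0 leaves the sign of both zeros alone.
    std::fesetround(FE_TONEAREST);
    std::printf("%d %d\n", std::signbit(pz + nz), std::signbit(nz + nz));  // prints: 0 1

    // Round-towards-minus-infinity: +0.0 + (-0.0) becomes -0.0, so -0.0 is
    // no longer an identity; +0.0 is the addend that preserves both signs.
    std::fesetround(FE_DOWNWARD);
    std::printf("%d %d\n", std::signbit(pz + nz), std::signbit(nz + pz));  // prints: 1 1
}
```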
@@ -565,12 +588,14 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -602,10 +627,17 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
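As a bit-level illustration of the comment above (and of the OR variant in the min path further down): after DAZ, two lanes that x86-64 calls "equal" but whose bit patterns differ can only be +0.0 and -0.0, so ANDing clears the sign bit (the +0 that ARM's max of differently signed zeros requires) while ORing sets it (the -0 that min requires). Illustrative sketch, not from the commit:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint32_t pos_zero = UINT32_C(0x00000000);  // bit pattern of +0.0f
    const std::uint32_t neg_zero = UINT32_C(0x80000000);  // bit pattern of -0.0f

    // Max path: AND of two lanes x86-64 considered equal keeps +0.0f, as ARM requires.
    std::printf("0x%08" PRIX32 "\n", pos_zero & neg_zero);  // 0x00000000
    // Min path: OR of the same two lanes keeps -0.0f instead.
    std::printf("0x%08" PRIX32 "\n", pos_zero | neg_zero);  // 0x80000000
}
```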
@@ -643,12 +675,14 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -680,10 +714,17 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
        // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.