diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index 3b651f80..4f2a8bc0 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -92,7 +92,7 @@ void DenormalsAreZero(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64 gpr_
     // SSE doesn't do this for us when SSE's DAZ is enabled.
 
     code.ja(end);
-    code.pxor(xmm_value, xmm_value);
+    code.andps(xmm_value, code.MConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
     code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7));
     code.L(end);
 }
@@ -267,13 +267,13 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unus
     Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
     Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
 
-    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
-        preprocess(result, operand, gpr_scratch, end);
-    }
     if (ctx.FPSCR_FTZ()) {
         DenormalsAreZero<fsize>(code, result, gpr_scratch);
         DenormalsAreZero<fsize>(code, operand, gpr_scratch);
     }
+    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
+        preprocess(result, operand, gpr_scratch, end);
+    }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
         PreProcessNaNs(code, result, operand, end);
     }
diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 452b2337..bbb181b6 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -131,6 +131,15 @@ Xbyak::Address GetNaNVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000);
+    } else {
+        return code.MConst(xword, 0x8000'0000'0000'0000, 0x8000'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -146,6 +155,20 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     }
 }
 
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
+    if (ctx.FPSCR_FTZ()) {
+        if (ctx.FPSCR_RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        } else {
+            code.xorps(tmp, tmp);
+        }
+        for (const Xbyak::Xmm& xmm : to_daz) {
+            FCODE(addp)(xmm, tmp);
+        }
+    }
+}
+
 template<typename T>
 struct DefaultIndexer {
     std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
@@ -565,12 +588,14 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -602,10 +627,17 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
@@ -643,12 +675,14 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -680,10 +714,17 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
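
Note on the new vector DenormalsAreZero helper: with the host MXCSR DAZ/FTZ bits set (which dynarmic does when guest FPSCR.FTZ is active), adding a signed zero to every lane flushes denormal inputs to a zero of the same sign while leaving normals, infinities and NaNs untouched. The addend is -0.0 under round-to-nearest, and +0.0 when the guest rounds towards minus infinity, because in that mode (+0) + (-0) rounds to -0 and would flip the sign of a flushed positive denormal. The following standalone host-side sketch is not part of the patch; it only assumes the standard SSE intrinsics headers and illustrates the identity:

#include <cstdio>
#include <pmmintrin.h>   // _MM_SET_DENORMALS_ZERO_MODE / _MM_DENORMALS_ZERO_ON
#include <xmmintrin.h>   // _MM_SET_FLUSH_ZERO_MODE / _MM_FLUSH_ZERO_ON

int main() {
    // Put the host FPU in the state the JIT uses when FPSCR.FTZ is set.
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

    volatile float pos_denormal = 1e-40f;   // subnormal in binary32
    volatile float neg_denormal = -1e-40f;
    volatile float normal = 1.5f;

    // With DAZ active, adding -0.0f flushes denormal inputs to a zero of the
    // same sign (round-to-nearest) and leaves every other value unchanged.
    std::printf("%g %g %g\n",
                pos_denormal + -0.0f,    // prints 0
                neg_denormal + -0.0f,    // prints -0
                normal + -0.0f);         // prints 1.5
}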
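Note on the AND/OR fix-up kept in the min/max paths: x86's maxps/minps treat +0 and -0 as equal and simply return the second operand, while ARM's FMAX must return +0 and FMIN must return -0 for that pair. The emitted code therefore ANDs (for max) or ORs (for min) the operands wherever they compare equal. A plain C++ sketch of the max case, written purely as an illustration (the function name is made up and NaN handling is omitted):

#include <cstdint>
#include <cstring>
#include <cstdio>

float arm_like_fmax(float a, float b) {
    if (a == b) {                            // covers the +0 / -0 case (and equal normals)
        std::uint32_t ua, ub;
        std::memcpy(&ua, &a, sizeof(ua));
        std::memcpy(&ub, &b, sizeof(ub));
        const std::uint32_t anded = ua & ub; // +0 wins over -0; the min path would OR instead
        float r;
        std::memcpy(&r, &anded, sizeof(r));
        return r;
    }
    return a > b ? a : b;                    // NaN handling omitted for brevity
}

int main() {
    std::printf("%g\n", arm_like_fmax(-0.0f, +0.0f));  // prints 0 (positive sign bit)
}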