backend_x64: Fix bugs when FPCR.FZ=1
Bugs:
* DenormalsAreZero flushed denormals to positive zero instead of preserving their sign.
* FMAXNM/FMINNM (scalar) should perform DAZ *before* their special zero handling.
* FMAX/FMIN/FMAXNM/FMINNM (vector) did not perform DAZ at all.
parent 2b538b471f
commit b393e15ab6

2 changed files with 49 additions and 8 deletions
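The first bug is in the scalar DenormalsAreZero helper: with FPCR.FZ=1 a denormal input must collapse to a zero that keeps the input's sign, not unconditionally to +0.0, which is what replacing the pxor with an andps against a negative-zero constant in the first hunk achieves. A minimal host-side sketch of that reference behaviour (the function name and test values below are illustrative only, not part of the commit):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Sign-preserving flush-to-zero for a single f32: keep only the sign bit,
// mirroring the `andps xmm_value, [f32_negative_zero]` emitted by the fix.
static float FlushDenormalToSignedZero(float value) {
    if (std::fpclassify(value) == FP_SUBNORMAL) {
        std::uint32_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        bits &= UINT32_C(0x80000000);  // +denormal -> +0.0f, -denormal -> -0.0f
        std::memcpy(&value, &bits, sizeof(bits));
    }
    return value;
}

int main() {
    const float pos = 1e-40f;   // subnormal in single precision
    const float neg = -1e-40f;
    std::printf("%d %d\n", std::signbit(FlushDenormalToSignedZero(pos)),
                           std::signbit(FlushDenormalToSignedZero(neg)));  // prints: 0 1
}
```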
@@ -92,7 +92,7 @@ void DenormalsAreZero(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64 gpr_
     // SSE doesn't do this for us when SSE's DAZ is enabled.
 
     code.ja(end);
-    code.pxor(xmm_value, xmm_value);
+    code.andps(xmm_value, code.MConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
     code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7));
     code.L(end);
 }
@@ -267,13 +267,13 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unus
     Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
     Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
 
-    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
-        preprocess(result, operand, gpr_scratch, end);
-    }
     if (ctx.FPSCR_FTZ()) {
         DenormalsAreZero<fsize>(code, result, gpr_scratch);
         DenormalsAreZero<fsize>(code, operand, gpr_scratch);
     }
+    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
+        preprocess(result, operand, gpr_scratch, end);
+    }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
         PreProcessNaNs<fsize>(code, result, operand, end);
     }
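The reorder above follows the architectural ordering: when FZ is set, input denormals are flushed before the operation's special-case handling ever sees them. A reference-style sketch of that ordering for a scalar max (illustrative only; the function names are made up and NaN handling is elided):

```cpp
#include <cmath>
#include <cstdio>

// Illustrative only: flush denormal inputs first, then apply the special
// zero rule (max(+0, -0) == +0), then the ordinary comparison.
static double FlushIfFZ(double x, bool fz) {
    return (fz && std::fpclassify(x) == FP_SUBNORMAL) ? std::copysign(0.0, x) : x;
}

static double FPMaxSketch(double a, double b, bool fz) {
    a = FlushIfFZ(a, fz);                  // step 1: DAZ happens before anything else
    b = FlushIfFZ(b, fz);
    if (a == 0.0 && b == 0.0) {            // step 2: special zero handling
        return std::signbit(a) ? b : a;    // +0 is the maximum of (+0, -0)
    }
    return a > b ? a : b;                  // step 3: ordinary max (NaNs elided)
}

int main() {
    // A positive denormal versus -0.0: after flushing, the zero rule applies.
    std::printf("%d\n", std::signbit(FPMaxSketch(1e-310, -0.0, true)));  // prints: 0
}
```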
@@ -131,6 +131,15 @@ Xbyak::Address GetNaNVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000);
+    } else {
+        return code.MConst(xword, 0x8000'0000'0000'0000, 0x8000'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -146,6 +155,20 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     }
 }
 
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
+    if (ctx.FPSCR_FTZ()) {
+        if (ctx.FPSCR_RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        } else {
+            code.xorps(tmp, tmp);
+        }
+        for (const Xbyak::Xmm& xmm : to_daz) {
+            FCODE(addp)(xmm, tmp);
+        }
+    }
+}
+
 template<typename T>
 struct DefaultIndexer {
     std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
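The new vector helper flushes by adding a zero vector to each register: adding a signed zero is an identity for every normal value and for zeros, and (assuming the host MXCSR DAZ bit is in effect when FPSCR_FTZ() is set, which the addps-based approach appears to rely on) a denormal lane is treated as a zero of its own sign, so it comes out as a correctly signed zero. The addend itself must not disturb the sign of zero lanes, which is why it is -0.0 in every rounding mode except round-towards-minus-infinity, where +0.0 is the identity instead (hence the xorps fallback). A small standalone demonstration of that rounding-mode detail (illustrative only, not from the commit; requires strict FP semantics, i.e. no -ffast-math):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
    // volatile keeps the sums from being folded at compile time, so they
    // observe the dynamic rounding mode set below.
    volatile double pz = +0.0, nz = -0.0;

    // Round-to-nearest: adding -0.0 leaves the sign of both zeros alone.
    std::fesetround(FE_TONEAREST);
    std::printf("%d %d\n", std::signbit(pz + nz), std::signbit(nz + nz));  // prints: 0 1

    // Round-towards-minus-infinity: +0.0 + (-0.0) becomes -0.0, so -0.0 is
    // no longer an identity; +0.0 is the addend that preserves both signs.
    std::fesetround(FE_DOWNWARD);
    std::printf("%d %d\n", std::signbit(pz + nz), std::signbit(nz + pz));  // prints: 1 1
}
```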
@@ -565,12 +588,14 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -602,10 +627,17 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
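As a bit-level illustration of the comment above (and of the OR variant in the min path further down): after DAZ, two lanes that x86-64 calls "equal" but whose bit patterns differ can only be +0.0 and -0.0, so ANDing clears the sign bit (the +0 that ARM's max of differently signed zeros requires) while ORing sets it (the -0 that min requires). Illustrative sketch, not from the commit:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint32_t pos_zero = UINT32_C(0x00000000);  // bit pattern of +0.0f
    const std::uint32_t neg_zero = UINT32_C(0x80000000);  // bit pattern of -0.0f

    // Max path: AND of two lanes x86-64 considered equal keeps +0.0f, as ARM requires.
    std::printf("0x%08" PRIX32 "\n", pos_zero & neg_zero);  // 0x00000000
    // Min path: OR of the same two lanes keeps -0.0f instead.
    std::printf("0x%08" PRIX32 "\n", pos_zero | neg_zero);  // 0x80000000
}
```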
@@ -643,12 +675,14 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -680,10 +714,17 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
        // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.