diff --git a/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp b/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp index 4bb1614a..16c7ea05 100644 --- a/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp @@ -416,6 +416,16 @@ void EmitIR<IR::Opcode::FPVectorMax64>(oaknut::CodeGenerator& code, EmitContext& EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); }); } +template<> +void EmitIR<IR::Opcode::FPVectorMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); }); +} + template<> void EmitIR<IR::Opcode::FPVectorMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); }); @@ -426,6 +436,16 @@ void EmitIR<IR::Opcode::FPVectorMin64>(oaknut::CodeGenerator& code, EmitContext& EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); }); } +template<> +void EmitIR<IR::Opcode::FPVectorMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); }); +} + template<> void EmitIR<IR::Opcode::FPVectorMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); }); diff --git 
a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 02ce4710..d97e906d 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -1060,6 +1060,117 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in CheckInputNaN::Yes); } +template<size_t fsize, bool is_max> +static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const bool fpcr_controlled = inst->GetArg(2).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm(); + + const Xbyak::Xmm tmp1 = xmm0; + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + // NaN requirements: + // op1 op2 result + // SNaN anything op1 + // !SNaN SNaN op2 + // QNaN !NaN op2 + // !NaN QNaN op1 + // QNaN QNaN op1 + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + // result = xmm_a == SNaN || xmm_b == QNaN + { + // evaluate xmm_b == QNaN + code.xorps(tmp1, tmp1); + FCODE(cmpunordp)(tmp1, xmm_b); + code.movaps(tmp2, xmm_b); + ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.psrad(tmp2, 31); + if constexpr (fsize == 64) { + code.pshufd(tmp2, tmp2, 0b11110101); + } + } + code.andps(tmp1, tmp2); + + code.movaps(result, tmp1); + + // evaluate xmm_a == SNaN + code.xorps(tmp1, tmp1); + FCODE(cmpunordp)(tmp1, xmm_a); + code.movaps(tmp2, xmm_a); + ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.psrad(tmp2, 31); + if constexpr (fsize == 64) { + code.pshufd(tmp2, tmp2, 0b11110101); + } + } + code.andnps(tmp2, tmp1); + + code.orps(result, tmp2); + } + + 
// Denormalization quiets SNaNs, therefore should happen after SNaN detection! + DenormalsAreZero(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1); + + // intermediate result = max/min(xmm_a, xmm_b) + { + const Xbyak::Xmm eq_mask = tmp1; + const Xbyak::Xmm eq = tmp2; + + code.movaps(eq_mask, xmm_a); + FCODE(cmpneqp)(eq_mask, xmm_b); + + code.movaps(eq, xmm_a); + code.movaps(intermediate_result, xmm_a); + if constexpr (is_max) { + code.andps(eq, xmm_b); + FCODE(maxp)(intermediate_result, xmm_b); + } else { + code.orps(eq, xmm_b); + FCODE(minp)(intermediate_result, xmm_b); + } + + code.andps(intermediate_result, eq_mask); + code.andnps(eq_mask, eq); + code.orps(intermediate_result, eq_mask); + } + + { + code.andps(xmm_a, result); + code.andnps(result, intermediate_result); + code.orps(result, xmm_a); + } + + if (ctx.FPCR(fpcr_controlled).DN()) { + const Xbyak::Xmm ord_mask = tmp1; + + code.xorps(ord_mask, ord_mask); + FCODE(cmpordp)(ord_mask, result); + + code.andps(result, ord_mask); + code.andnps(ord_mask, GetNaNVector<fsize>(code)); + code.orps(result, ord_mask); + } else { + const Xbyak::Xmm nan_mask = tmp1; + + code.xorps(nan_mask, nan_mask); + FCODE(cmpunordp)(nan_mask, result); + code.andps(nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code)); + code.orps(result, nan_mask); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); +} + void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) { EmitFPVectorMinMax<32, true>(code, ctx, inst); } @@ -1068,6 +1179,14 @@ void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) { EmitFPVectorMinMax<64, true>(code, ctx, inst); } +void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<64, true>(code, ctx, inst); +} + void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) { EmitFPVectorMinMax<32, false>(code, ctx, inst); } 
@@ -1076,6 +1195,14 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) { EmitFPVectorMinMax<64, false>(code, ctx, inst); } +void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<64, false>(code, ctx, inst); +} + void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) { EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps); } diff --git a/src/dynarmic/ir/ir_emitter.cpp b/src/dynarmic/ir/ir_emitter.cpp index d025e0fc..ae61f1bf 100644 --- a/src/dynarmic/ir/ir_emitter.cpp +++ b/src/dynarmic/ir/ir_emitter.cpp @@ -2634,6 +2634,16 @@ U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpc UNREACHABLE(); } +U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { switch (esize) { case 32: @@ -2644,6 +2654,16 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc UNREACHABLE(); } +U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { switch (esize) { case 32: diff --git a/src/dynarmic/ir/ir_emitter.h b/src/dynarmic/ir/ir_emitter.h index 8a12c940..0efd25c5 100644 --- 
a/src/dynarmic/ir/ir_emitter.h +++ b/src/dynarmic/ir/ir_emitter.h @@ -372,7 +372,9 @@ public: U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true); U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); diff --git a/src/dynarmic/ir/opcodes.inc b/src/dynarmic/ir/opcodes.inc index 114ad333..4da1f8ff 100644 --- a/src/dynarmic/ir/opcodes.inc +++ b/src/dynarmic/ir/opcodes.inc @@ -668,8 +668,12 @@ OPCODE(FPVectorGreaterEqual32, U128, U128 OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 ) OPCODE(FPVectorMax32, U128, U128, U128, U1 ) OPCODE(FPVectorMax64, U128, U128, U128, U1 ) +OPCODE(FPVectorMaxNumeric32, U128, U128, U128, U1 ) +OPCODE(FPVectorMaxNumeric64, U128, U128, U128, U1 ) OPCODE(FPVectorMin32, U128, U128, U128, U1 ) OPCODE(FPVectorMin64, U128, U128, U128, U1 ) +OPCODE(FPVectorMinNumeric32, U128, U128, U128, U1 ) +OPCODE(FPVectorMinNumeric64, U128, U128, U128, U1 ) OPCODE(FPVectorMul32, U128, U128, U128, U1 ) OPCODE(FPVectorMul64, U128, U128, U128, U1 ) OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 )