From 2780b2318d3e1ffcc5c026df8f9f357f7dc28eab Mon Sep 17 00:00:00 2001
From: zmt00
Date: Mon, 1 Jan 2024 19:18:55 -0800
Subject: [PATCH] emit_x64_vector: Implement SSE4.1 PairedMinMaxLower32

---
 src/dynarmic/backend/x64/emit_x64_vector.cpp | 38 ++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 032bb121..eead80b1 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -2717,6 +2717,24 @@ static void LowerPairedMin(VectorArray<T>& result, const VectorArray<T>& x, cons
     LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); });
 }
 
+static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // tmp = x[1], y[1], 0, 0
+    code.movaps(tmp, y);
+    code.insertps(tmp, x, 0b01001100);
+    // x = x[0], y[0], 0, 0
+    code.insertps(x, y, 0b00011100);
+
+    (code.*fn)(x, tmp);
+
+    ctx.reg_alloc.DefineValue(inst, x);
+}
+
 void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) {
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
         PairedMax(result, a, b);
@@ -2900,6 +2918,11 @@ void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
+        return;
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
         LowerPairedMax(result, a, b);
     });
@@ -2918,6 +2941,11 @@ void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
+        return;
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
         LowerPairedMax(result, a, b);
     });
@@ -2936,6 +2964,11 @@ void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
+        return;
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
         LowerPairedMin(result, a, b);
     });
@@ -2954,6 +2987,11 @@ void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
+        return;
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
         LowerPairedMin(result, a, b);
     });
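
Reviewer note (not part of the patch): a minimal reference sketch of what the emitted insertps + pmaxud sequence computes, for the unsigned-max variant; the other three variants differ only in the scalar operation and signedness. The insertps immediate encodes the source lane in bits [7:6], the destination lane in bits [5:4], and a zero mask in bits [3:0], so 0b01001100 copies source lane 1 into destination lane 0 and zeroes lanes 2-3, while 0b00011100 copies source lane 0 into destination lane 1 and zeroes lanes 2-3. The function name ref_paired_max_lower_u32 and the use of std::array as a stand-in for a 128-bit register are illustrative assumptions only.

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // result = { max(x[0], x[1]), max(y[0], y[1]), 0, 0 }
    static std::array<std::uint32_t, 4> ref_paired_max_lower_u32(
            const std::array<std::uint32_t, 4>& x,
            const std::array<std::uint32_t, 4>& y) {
        // tmp = { x[1], y[1], 0, 0 }  -- movaps tmp, y; insertps tmp, x, 0b01001100
        const std::array<std::uint32_t, 4> tmp{x[1], y[1], 0, 0};
        // lo  = { x[0], y[0], 0, 0 }  -- insertps x, y, 0b00011100
        const std::array<std::uint32_t, 4> lo{x[0], y[0], 0, 0};

        // lane-wise unsigned max, i.e. what pmaxud computes
        std::array<std::uint32_t, 4> result{};
        for (std::size_t i = 0; i < 4; ++i) {
            result[i] = std::max(lo[i], tmp[i]);
        }
        return result;
    }

This matches the AArch64 lower-half paired semantics the fallback implements: the two pairwise results land in the low 64 bits and the upper 64 bits are zeroed.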