From ba9009abd87285c8088f39d4c9301a31b74d4da1 Mon Sep 17 00:00:00 2001 From: zmt00 Date: Sat, 20 Jan 2024 17:34:54 -0800 Subject: [PATCH] emit_x64_vector: Optimize VectorSignedAbsoluteDifference --- src/dynarmic/backend/x64/emit_x64_vector.cpp | 68 ++++++++++++-------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp index a878746c..85fa8d38 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -3744,36 +3744,52 @@ static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, I auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); - const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); - code.movdqa(mask, x); - code.movdqa(tmp1, y); + // only signed 16-bit min/max are available below SSE4.1 + if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) { + code.movdqa(tmp, x); - switch (esize) { - case 8: - code.pcmpgtb(mask, y); - code.psubb(tmp1, x); - code.psubb(x, y); - break; - case 16: - code.pcmpgtw(mask, y); - code.psubw(tmp1, x); - code.psubw(x, y); - break; - case 32: - code.pcmpgtd(mask, y); - code.psubd(tmp1, x); - code.psubd(x, y); - break; + switch (esize) { + case 8: + code.pminsb(tmp, y); + code.pmaxsb(x, y); + code.psubb(x, tmp); + break; + case 16: + code.pminsw(tmp, y); + code.pmaxsw(x, y); + code.psubw(x, tmp); + break; + case 32: + code.pminsd(tmp, y); + code.pmaxsd(x, y); + code.psubd(x, tmp); + break; + default: + UNREACHABLE(); + } + } else { + code.movdqa(tmp, y); + + switch (esize) { + case 8: + code.pcmpgtb(tmp, x); + code.psubb(x, y); + code.pxor(x, tmp); + code.psubb(x, tmp); + break; + case 32: + code.pcmpgtd(tmp, x); + code.psubd(x, y); + code.pxor(x, tmp); + code.psubd(x, tmp); + break; + default: + UNREACHABLE(); + } } - code.movdqa(tmp2, mask); - code.pand(x, mask); - code.pandn(tmp2, tmp1); - code.por(x, tmp2); - ctx.reg_alloc.DefineValue(inst, x); }