From a43c176fc3984446d15e473fd2803493536412b6 Mon Sep 17 00:00:00 2001
From: zmt00
Date: Sat, 9 Dec 2023 15:43:08 -0800
Subject: [PATCH] emit_x64_vector: Add SSSE3 implementation of VUZP{1,2}.4H

---
 src/dynarmic/backend/x64/emit_x64_vector.cpp | 44 +++++++++++++-------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 5c7c53ee..bd393dc7 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -1129,17 +1129,25 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);

-    code.pslld(lhs, 16);
-    code.psrad(lhs, 16);
+    if (code.HasHostFeature(HostFeature::SSSE3)) {  // pshufb below is an SSSE3 instruction
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);  // rhs is only read on this path

-    code.pslld(rhs, 16);
-    code.psrad(rhs, 16);
+        code.punpcklwd(lhs, rhs);  // 16-bit lanes become {l0, r0, l1, r1, l2, r2, l3, r3}
+        code.pshufb(lhs, code.MConst(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));  // bytes 0,1,8,9,2,3,10,11 -> {l0, l2, r0, r2}; 0x80 indices write zero (assumes MConst takes low qword, then high qword - confirm)
+    } else {
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);  // rhs is shifted in place below

-    code.packssdw(lhs, rhs);
-    code.pshufd(lhs, lhs, 0b11011000);
-    code.movq(lhs, lhs);
+        code.pslld(lhs, 16);  // shift left then arithmetic right: sign-extends the low word of each dword
+        code.psrad(lhs, 16);
+
+        code.pslld(rhs, 16);  // same sign-extension for rhs
+        code.psrad(rhs, 16);
+
+        code.packssdw(lhs, rhs);  // every dword already fits in 16 bits, so signed saturation changes nothing
+        code.pshufd(lhs, lhs, 0b11011000);  // dword order 0,2,1,3: low qword now holds two words from lhs, then two from rhs
+        code.movq(lhs, lhs);  // clears the upper 64 bits
+    }

     ctx.reg_alloc.DefineValue(inst, lhs);
 }
@@ -1221,13 +1229,21 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);

-    code.psrad(lhs, 16);
-    code.psrad(rhs, 16);
-    code.packssdw(lhs, rhs);
-    code.pshufd(lhs, lhs, 0b11011000);
-    code.movq(lhs, lhs);
+    if (code.HasHostFeature(HostFeature::SSSE3)) {  // pshufb below is an SSSE3 instruction
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);  // rhs is only read on this path
+
+        code.punpcklwd(lhs, rhs);  // 16-bit lanes become {l0, r0, l1, r1, l2, r2, l3, r3}
+        code.pshufb(lhs, code.MConst(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));  // bytes 4,5,12,13,6,7,14,15 -> {l1, l3, r1, r3}; 0x80 indices write zero (assumes MConst takes low qword, then high qword - confirm)
+    } else {
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);  // rhs is shifted in place below
+
+        code.psrad(lhs, 16);  // arithmetic shift leaves the sign-extended high word of each dword
+        code.psrad(rhs, 16);  // same for rhs
+        code.packssdw(lhs, rhs);  // every dword already fits in 16 bits, so signed saturation changes nothing
+        code.pshufd(lhs, lhs, 0b11011000);  // dword order 0,2,1,3: low qword now holds two words from lhs, then two from rhs
+        code.movq(lhs, lhs);  // clears the upper 64 bits
+    }

     ctx.reg_alloc.DefineValue(inst, lhs);
 }