From 8ef0f2b54f83f0c359e05fc9f7711d4e83dcd210 Mon Sep 17 00:00:00 2001
From: zmt00
Date: Thu, 14 Dec 2023 18:06:21 -0800
Subject: [PATCH] emit_x64_vector: Add SSSE3 implementation of VUZP{1,2}.8B

---
Review notes (text between the three-dash line and the diff is not part of
the commit):

  * SSSE3 path: punpcklbw(lhs, rhs) interleaves the low eight bytes of both
    operands, giving l0 r0 l1 r1 ... l7 r7. The pshufb control then gathers
    bytes 0,4,8,12,1,5,9,13 (even elements: l0 l2 l4 l6 r0 r2 r4 r6) or
    bytes 2,6,10,14,3,7,11,15 (odd elements: l1 l3 l5 l7 r1 r3 r5 r7) into
    the low eight bytes of the result.
  * A pshufb control byte of 0x80 writes zero, so the upper eight bytes of
    the result are cleared, which matches the trailing movq(lhs, lhs) of the
    fallback path. This reading assumes MConst(xword, a, b) places `a` in
    the low qword and `b` in the high qword -- confirm against MConst.
  * On the SSSE3 path rhs is only ever a source operand, which is presumably
    why it is requested with UseXmm rather than UseScratchXmm -- confirm
    against the register allocator's contract.

 src/dynarmic/backend/x64/emit_x64_vector.cpp | 44 +++++++++++++-------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp
index fbcfc8c5..032bb121 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -1122,15 +1122,23 @@ void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
 
-    code.movdqa(tmp, code.MConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
-    code.pand(lhs, tmp);
-    code.pand(rhs, tmp);
-    code.packuswb(lhs, rhs);
-    code.pshufd(lhs, lhs, 0b11011000);
-    code.movq(lhs, lhs);
+    if (code.HasHostFeature(HostFeature::SSSE3)) {
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+        code.punpcklbw(lhs, rhs);
+        code.pshufb(lhs, code.MConst(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
+    } else {
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+        code.movdqa(tmp, code.MConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+        code.pand(lhs, tmp);
+        code.pand(rhs, tmp);
+        code.packuswb(lhs, rhs);
+        code.pshufd(lhs, lhs, 0b11011000);
+        code.movq(lhs, lhs);
+    }
 
     ctx.reg_alloc.DefineValue(inst, lhs);
 }
@@ -1224,13 +1232,21 @@ void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
 
-    code.psraw(lhs, 8);
-    code.psraw(rhs, 8);
-    code.packsswb(lhs, rhs);
-    code.pshufd(lhs, lhs, 0b11011000);
-    code.movq(lhs, lhs);
+    if (code.HasHostFeature(HostFeature::SSSE3)) {
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+        code.punpcklbw(lhs, rhs);
+        code.pshufb(lhs, code.MConst(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
+    } else {
+        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+        code.psraw(lhs, 8);
+        code.psraw(rhs, 8);
+        code.packsswb(lhs, rhs);
+        code.pshufd(lhs, lhs, 0b11011000);
+        code.movq(lhs, lhs);
+    }
 
     ctx.reg_alloc.DefineValue(inst, lhs);
 }