emit_x64_vector: Add SSE4.1 implementation of VUZP{1,2}.2S

This commit is contained in:
zmt00 2023-12-02 10:36:27 -08:00 committed by merry
parent d68b916f57
commit 4c2bd4ed29

View file

@ -1149,8 +1149,13 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst
const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
// copy bytes 0:3 of rhs to bytes 4:7 of lhs, zero out upper 8 bytes
code.insertps(lhs, rhs, 0b00011100);
} else {
code.unpcklps(lhs, rhs);
code.movq(lhs, lhs);
}
ctx.reg_alloc.DefineValue(inst, lhs);
}
@ -1229,6 +1234,16 @@ void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
// copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes
code.insertps(rhs, lhs, 0b01001100);
ctx.reg_alloc.DefineValue(inst, rhs);
} else {
const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
@ -1239,6 +1254,7 @@ void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst)
ctx.reg_alloc.DefineValue(inst, lhs);
}
}
void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pxor);