diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 455f1f96..3bbf9f46 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -491,6 +491,128 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+// Emits code selecting the even-indexed bytes of lhs (low half of the result) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // Zero the odd byte of every 16-bit lane so the unsigned-saturating pack never clamps.
+    code.movdqa(tmp, code.MConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+    code.pand(lhs, tmp);
+    code.pand(rhs, tmp);
+    code.packuswb(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code selecting the even-indexed 16-bit elements of lhs (low half) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    // Sign-extend the low halfword of each 32-bit lane so the signed-saturating pack keeps it intact.
+    code.pslld(lhs, 16);
+    code.psrad(lhs, 16);
+
+    code.pslld(rhs, 16);
+    code.psrad(rhs, 16);
+
+    code.packssdw(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code selecting the even-indexed 32-bit elements of lhs (low half) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    // Each register becomes [e0, e2, e0, e2]; the merge keeps lhs's low qword and takes the other from rhs.
+    code.pshufd(lhs, lhs, 0b10001000);
+    code.pshufd(rhs, rhs, 0b10001000);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        code.pblendw(lhs, rhs, 0b11110000);
+    } else {
+        code.punpcklqdq(lhs, rhs);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code joining the low 64-bit element of lhs (low half) with that of rhs (high half).
+void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); // read-only source, so no scratch copy
+
+    code.punpcklqdq(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code selecting the odd-indexed bytes of lhs (low half of the result) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    // The arithmetic shift leaves a sign-extended byte in each word, so packsswb never saturates.
+    code.psraw(lhs, 8);
+    code.psraw(rhs, 8);
+    code.packsswb(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code selecting the odd-indexed 16-bit elements of lhs (low half) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    // The arithmetic shift leaves a sign-extended halfword in each dword, so packssdw never saturates.
+    code.psrad(lhs, 16);
+    code.psrad(rhs, 16);
+    code.packssdw(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code selecting the odd-indexed 32-bit elements of lhs (low half) and rhs (high half).
+void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    // Each register becomes [e1, e3, e1, e3]; the merge keeps lhs's low qword and takes the other from rhs.
+    code.pshufd(lhs, lhs, 0b11011101);
+    code.pshufd(rhs, rhs, 0b11011101);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        code.pblendw(lhs, rhs, 0b11110000);
+    } else {
+        code.punpcklqdq(lhs, rhs);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+// Emits code joining the high 64-bit element of lhs (low half) with that of rhs (high half).
+void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); // read-only source, so no scratch copy
+
+    code.punpckhqdq(lhs, rhs);
+
+    ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
 void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pxor);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 
fd86d9c5..deb290c5 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -794,6 +794,46 @@ U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) {
     return {};
 }
 
+// Emits the VectorDeinterleaveEven opcode whose element width matches esize.
+// esize must be 8, 16, 32 or 64; any other width falls through to UNREACHABLE().
+U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) {
+    if (esize == 8) {
+        return Inst(Opcode::VectorDeinterleaveEven8, a, b);
+    }
+    if (esize == 16) {
+        return Inst(Opcode::VectorDeinterleaveEven16, a, b);
+    }
+    if (esize == 32) {
+        return Inst(Opcode::VectorDeinterleaveEven32, a, b);
+    }
+    if (esize == 64) {
+        return Inst(Opcode::VectorDeinterleaveEven64, a, b);
+    }
+
+    UNREACHABLE();
+    return {};
+}
+
+// Emits the VectorDeinterleaveOdd opcode whose element width matches esize.
+// esize must be 8, 16, 32 or 64; any other width falls through to UNREACHABLE().
+U128 IREmitter::VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b) {
+    if (esize == 8) {
+        return Inst(Opcode::VectorDeinterleaveOdd8, a, b);
+    }
+    if (esize == 16) {
+        return Inst(Opcode::VectorDeinterleaveOdd16, a, b);
+    }
+    if (esize == 32) {
+        return Inst(Opcode::VectorDeinterleaveOdd32, a, b);
+    }
+    if (esize == 64) {
+        return Inst(Opcode::VectorDeinterleaveOdd64, a, b);
+    }
+
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorEor(const U128& a, const U128& b) {
     return Inst(Opcode::VectorEor, a, b);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 880a165f..bbe3fde2 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -197,6 +197,8 @@ public:
     U128 VectorBroadcast(size_t esize, const UAny& a);
     U128 VectorBroadcastLower(size_t esize, const UAny& a);
     U128 VectorEor(const U128& a, const U128& b);
+    U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
+    U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b);
     U128 VectorEqual(size_t esize, const U128& a, const U128& b);
     U128 VectorExtract(const U128& a, const U128& b, size_t position);
     U128 VectorExtractLower(const U128& a, const U128& b, size_t position);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 4c9aca9d..86fb76ed 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -231,6 +231,14 @@ 
OPCODE(VectorBroadcast8, T::U128, T::U8
 OPCODE(VectorBroadcast16, T::U128, T::U16 )
 OPCODE(VectorBroadcast32, T::U128, T::U32 )
 OPCODE(VectorBroadcast64, T::U128, T::U64 )
+OPCODE(VectorDeinterleaveEven8, T::U128, T::U128, T::U128 ) // even-indexed bytes of operand 0, then of operand 1
+OPCODE(VectorDeinterleaveEven16, T::U128, T::U128, T::U128 ) // same selection on 16-bit elements
+OPCODE(VectorDeinterleaveEven32, T::U128, T::U128, T::U128 ) // same selection on 32-bit elements
+OPCODE(VectorDeinterleaveEven64, T::U128, T::U128, T::U128 ) // same selection on 64-bit elements
+OPCODE(VectorDeinterleaveOdd8, T::U128, T::U128, T::U128 ) // odd-indexed bytes of operand 0, then of operand 1
+OPCODE(VectorDeinterleaveOdd16, T::U128, T::U128, T::U128 ) // same selection on 16-bit elements
+OPCODE(VectorDeinterleaveOdd32, T::U128, T::U128, T::U128 ) // same selection on 32-bit elements
+OPCODE(VectorDeinterleaveOdd64, T::U128, T::U128, T::U128 ) // same selection on 64-bit elements
 OPCODE(VectorEor, T::U128, T::U128, T::U128 )
 OPCODE(VectorEqual8, T::U128, T::U128, T::U128 )
 OPCODE(VectorEqual16, T::U128, T::U128, T::U128 )