diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 727d3bce..bd84b00c 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -8,6 +8,7 @@
 #include "backend_x64/block_of_code.h"
 #include "backend_x64/emit_x64.h"
 #include "common/assert.h"
+#include "common/bit_util.h"
 #include "common/common_types.h"
 #include "common/mp.h"
 #include "frontend/ir/basic_block.h"
@@ -882,6 +883,39 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+        code.movdqa(high_a, low_a);
+        code.psrlw(high_a, 4);
+        code.movdqa(tmp1, code.MConst(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
+        code.pand(high_a, tmp1); // High nibbles
+        code.pand(low_a, tmp1); // Low nibbles
+
+        code.movdqa(tmp1, code.MConst(0x0302020102010100, 0x0403030203020201));
+        code.movdqa(tmp2, tmp1);
+        code.pshufb(tmp1, low_a);
+        code.pshufb(tmp2, high_a);
+
+        code.paddb(tmp1, tmp2);
+
+        ctx.reg_alloc.DefineValue(inst, tmp1);
+        return;
+    }
+
+    EmitTwoArgumentFallback(code, ctx, inst, [](std::array<u8, 16>& result, const std::array<u8, 16>& a){
+        for (size_t i = 0; i < 16; ++i) {
+            result[i] = static_cast<u8>(Common::BitCount(a[i]));
+        }
+    });
+}
+
 void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 5c59c10c..402319e3 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -977,6 +977,10 @@ U128 IREmitter::VectorPairedAdd(size_t esize, const U128& a, const U128& b) {
     return {};
 }
 
+U128 IREmitter::VectorPopulationCount(const U128& a) {
+    return Inst<U128>(Opcode::VectorPopulationCount, a);
+}
+
 U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 94ee4184..436f8d11 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -223,6 +223,7 @@ public:
     U128 VectorOr(const U128& a, const U128& b);
     U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b);
     U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
+    U128 VectorPopulationCount(const U128& a);
     U128 VectorSub(size_t esize, const U128& a, const U128& b);
     U128 VectorZeroExtend(size_t original_esize, const U128& a);
     U128 VectorZeroUpper(const U128& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 467d7420..54690b71 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -248,6 +248,7 @@ OPCODE(VectorPairedAdd8,        T::U128, T::U128, T::U128 )
 OPCODE(VectorPairedAdd16,       T::U128, T::U128, T::U128 )
 OPCODE(VectorPairedAdd32,       T::U128, T::U128, T::U128 )
 OPCODE(VectorPairedAdd64,       T::U128, T::U128, T::U128 )
+OPCODE(VectorPopulationCount,   T::U128, T::U128          )
 OPCODE(VectorSub8,              T::U128, T::U128, T::U128 )
 OPCODE(VectorSub16,             T::U128, T::U128, T::U128 )
 OPCODE(VectorSub32,             T::U128, T::U128, T::U128 )
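
Note (not part of the patch): the SSSE3 path above uses the classic PSHUFB nibble-lookup population count. Each byte is split into its low and high nibble, each nibble indexes a 16-entry table of precomputed bit counts (the constant 0x0403030203020201'0302020102010100 read byte-wise is 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4), and the two lookups are summed per byte with paddb. The sketch below is an illustrative standalone equivalent written with SSE intrinsics, assuming an SSSE3-capable compiler (e.g. -mssse3); the function name popcount_bytes is invented for the example and does not appear in dynarmic.

#include <cstdint>
#include <cstdio>
#include <tmmintrin.h> // SSSE3 intrinsics (_mm_shuffle_epi8)

// Per-byte population count of a 128-bit vector, using the same
// nibble-lookup technique as the JIT code in the patch above.
static __m128i popcount_bytes(__m128i v) {
    const __m128i nibble_mask = _mm_set1_epi8(0x0F);
    // Table of popcounts for every 4-bit value 0..15
    // (byte-wise view of 0x0302020102010100 / 0x0403030203020201).
    const __m128i table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i lo = _mm_and_si128(v, nibble_mask);
    const __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
    // pshufb treats each byte of its second operand as an index into the table.
    return _mm_add_epi8(_mm_shuffle_epi8(table, lo), _mm_shuffle_epi8(table, hi));
}

int main() {
    alignas(16) std::uint8_t in[16] = {0x00, 0xFF, 0x0F, 0xF0, 0x55, 0xAA, 0x01, 0x80,
                                       0x03, 0x07, 0x7F, 0xFE, 0x11, 0x22, 0x33, 0x44};
    alignas(16) std::uint8_t out[16];
    const __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
    _mm_store_si128(reinterpret_cast<__m128i*>(out), popcount_bytes(v));
    for (int i = 0; i < 16; ++i) {
        std::printf("%u ", static_cast<unsigned>(out[i])); // prints: 0 8 4 4 4 4 1 1 2 3 7 7 2 2 4 2
    }
    std::printf("\n");
    return 0;
}

The table-lookup form counts bits in all 16 bytes with a handful of instructions, which is why the JIT prefers it whenever SSSE3 is available and only falls back to the scalar Common::BitCount loop otherwise.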