IR: Implement VectorPopulationCount
This commit is contained in:
parent
1dd2b33b87
commit
303088a51e
4 changed files with 40 additions and 0 deletions
|
@ -8,6 +8,7 @@
|
||||||
#include "backend_x64/block_of_code.h"
|
#include "backend_x64/block_of_code.h"
|
||||||
#include "backend_x64/emit_x64.h"
|
#include "backend_x64/emit_x64.h"
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
|
#include "common/bit_util.h"
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "common/mp.h"
|
#include "common/mp.h"
|
||||||
#include "frontend/ir/basic_block.h"
|
#include "frontend/ir/basic_block.h"
|
||||||
|
@ -882,6 +883,39 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.DefineValue(inst, a);
|
ctx.reg_alloc.DefineValue(inst, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits host code for the VectorPopulationCount IR instruction: every byte of
// the 128-bit result holds the number of set bits in the corresponding byte of
// the single vector argument.
//
// Two strategies are used:
//  * Without SSSE3, the work is delegated to EmitTwoArgumentFallback with a
//    lambda that counts bits one byte at a time via Common::BitCount.
//  * With SSSE3, each byte is split into its two 4-bit halves and pshufb is
//    used as a 16-entry table lookup for the bit count of each half.
void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
    if (!code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
        // pshufb is unavailable, so take the generic per-byte path.
        EmitTwoArgumentFallback(code, ctx, inst, [](std::array<u8, 16>& result, const std::array<u8, 16>& a){
            for (size_t index = 0; index != result.size(); ++index) {
                result[index] = static_cast<u8>(Common::BitCount(a[index]));
            }
        });
        return;
    }

    // 0x0F in every byte: keeps only the low 4 bits of each byte.
    constexpr u64 nibble_mask = 0x0F0F0F0F0F0F0F0F;
    // Lookup table: byte i (0..15) holds the number of set bits in i.
    constexpr u64 bit_counts_0_to_7 = 0x0302020102010100;
    constexpr u64 bit_counts_8_to_15 = 0x0403030203020201;

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm lo_nibbles = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm hi_nibbles = ctx.reg_alloc.ScratchXmm();
    Xbyak::Xmm lo_counts = ctx.reg_alloc.ScratchXmm(); // Holds the nibble mask first, then the table
    Xbyak::Xmm hi_counts = ctx.reg_alloc.ScratchXmm();

    // psrlw shifts 16-bit lanes, so bits leak across byte boundaries; the
    // mask applied below discards them again.
    code.movdqa(hi_nibbles, lo_nibbles);
    code.psrlw(hi_nibbles, 4);
    code.movdqa(lo_counts, code.MConst(nibble_mask, nibble_mask));
    code.pand(hi_nibbles, lo_counts); // Upper 4 bits of every byte
    code.pand(lo_nibbles, lo_counts); // Lower 4 bits of every byte

    // pshufb overwrites its destination, so each lookup needs its own copy
    // of the table; the nibble values act as the per-byte indices.
    code.movdqa(lo_counts, code.MConst(bit_counts_0_to_7, bit_counts_8_to_15));
    code.movdqa(hi_counts, lo_counts);
    code.pshufb(lo_counts, lo_nibbles);
    code.pshufb(hi_counts, hi_nibbles);

    // Bit count of a byte = count of its low half + count of its high half.
    code.paddb(lo_counts, hi_counts);

    ctx.reg_alloc.DefineValue(inst, lo_counts);
}
|
||||||
|
|
||||||
void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
|
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
|
||||||
}
|
}
|
||||||
|
|
|
@ -977,6 +977,10 @@ U128 IREmitter::VectorPairedAdd(size_t esize, const U128& a, const U128& b) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a VectorPopulationCount IR instruction with `a` as its only operand
// and returns the U128 value produced by Inst<U128>.
U128 IREmitter::VectorPopulationCount(const U128& a) {
    constexpr Opcode op = Opcode::VectorPopulationCount;
    return Inst<U128>(op, a);
}
|
||||||
|
|
||||||
U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
|
U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -223,6 +223,7 @@ public:
|
||||||
U128 VectorOr(const U128& a, const U128& b);
|
U128 VectorOr(const U128& a, const U128& b);
|
||||||
U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b);
|
U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b);
|
||||||
U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
|
U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
|
||||||
|
// Emits Opcode::VectorPopulationCount on `a`; the x64 backend lowers this to a
// per-byte count of set bits.
U128 VectorPopulationCount(const U128& a);
|
||||||
U128 VectorSub(size_t esize, const U128& a, const U128& b);
|
U128 VectorSub(size_t esize, const U128& a, const U128& b);
|
||||||
U128 VectorZeroExtend(size_t original_esize, const U128& a);
|
U128 VectorZeroExtend(size_t original_esize, const U128& a);
|
||||||
U128 VectorZeroUpper(const U128& a);
|
U128 VectorZeroUpper(const U128& a);
|
||||||
|
|
|
@ -248,6 +248,7 @@ OPCODE(VectorPairedAdd8, T::U128, T::U128, T::U128
|
||||||
OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(VectorPopulationCount, T::U128, T::U128 )
|
||||||
OPCODE(VectorSub8, T::U128, T::U128, T::U128 )
|
OPCODE(VectorSub8, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorSub16, T::U128, T::U128, T::U128 )
|
OPCODE(VectorSub16, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorSub32, T::U128, T::U128, T::U128 )
|
OPCODE(VectorSub32, T::U128, T::U128, T::U128 )
|
||||||
|
|
Loading…
Reference in a new issue