emit_x64_vector: SSSE3 variant of EmitVectorCountLeadingZeros8()
pshufb lyfe
This commit is contained in:
parent
9130ed45b9
commit
da2e7fad87
1 changed files with 18 additions and 31 deletions
|
@ -632,41 +632,28 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
|
|||
}
|
||||
|
||||
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm data_hi = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm data_lo = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm tmp_hi = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm tmp_lo = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
||||
|
||||
code.vpsllw(data_lo, data_hi, 8);
|
||||
code.vpsrlw(tmp_hi, data_hi, 1);
|
||||
code.vpsrlw(tmp_lo, data_lo, 1);
|
||||
code.vpor(data_hi, data_hi, tmp_hi);
|
||||
code.vpor(data_lo, data_lo, tmp_lo);
|
||||
code.vpsrlw(tmp_hi, data_hi, 2);
|
||||
code.vpsrlw(tmp_lo, data_lo, 2);
|
||||
code.vpor(data_hi, data_hi, tmp_hi);
|
||||
code.vpor(data_lo, data_lo, tmp_lo);
|
||||
code.vpsrlw(tmp_hi, data_hi, 4);
|
||||
code.vpsrlw(tmp_lo, data_lo, 4);
|
||||
code.vpor(data_hi, data_hi, tmp_hi);
|
||||
code.vpor(data_lo, data_lo, tmp_lo);
|
||||
code.vpsrlw(data_hi, data_hi, 8);
|
||||
code.vpsrlw(data_lo, data_lo, 8);
|
||||
code.vmovdqa(result, code.MConst(xword, 0xe700e700e700e700, 0xe700e700e700e700));
|
||||
code.vpmullw(data_hi, data_hi, result);
|
||||
code.vpmullw(data_lo, data_lo, result);
|
||||
code.vpsrlw(data_hi, data_hi, 12);
|
||||
code.vpsrlw(data_lo, data_lo, 12);
|
||||
code.vpsllw(data_hi, data_hi, 8);
|
||||
code.vpor(data_hi, data_hi, data_lo);
|
||||
code.vmovdqa(result, code.MConst(xword, 0xaaaa05aaaaaa0008, 0x030702aa06aa0104));
|
||||
code.vpshufb(result, result, data_hi);
|
||||
code.movdqa(tmp1, code.MConst(xword, 0x0101010102020304, 0x0000000000000000));
|
||||
code.movdqa(tmp2, tmp1);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
code.pshufb(tmp2, data);
|
||||
code.psrlw(data, 4);
|
||||
code.pand(data, code.MConst(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
|
||||
code.pshufb(tmp1, data);
|
||||
|
||||
code.movdqa(data, code.MConst(xword, 0x0404040404040404, 0x0404040404040404));
|
||||
|
||||
code.pcmpeqb(data, tmp1);
|
||||
code.pand(data, tmp2);
|
||||
code.paddb(data, tmp1);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, data);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue