ir: Add opcodes for vector CLZ operations

We can optimize these cases further with a fair bit of shuffling via
pshufb and the use of masks, but given how uncommonly this instruction
is used, I don't consider the extra amount of code to be worth it over
a simple, manageable naive solution like this.

If we ever do hit a case where vectorized CLZ happens to be a
bottleneck, then we can revisit this. At least with AVX-512CD, this can
be done with a single instruction for the 32-bit word case.
This commit is contained in:
Lioncash 2018-09-09 17:06:47 -04:00 committed by MerryMage
parent d4c37a68a8
commit e739624296
4 changed files with 54 additions and 0 deletions

View file

@ -616,6 +616,43 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, a);
}
// Scalar fallback for vector count-leading-zeros.
// For every element of `data`, writes the number of leading zero bits into the
// matching element of `result`. A zero element yields the full bit width of T.
template <typename T>
static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArray<T>& data) {
    for (size_t lane = 0; lane < result.size(); ++lane) {
        // Begin at the full width and remove one for each shift needed to clear the value.
        size_t leading_zeros = Common::BitSize<T>();
        for (T remaining = data[lane]; remaining != 0; remaining >>= 1) {
            --leading_zeros;
        }
        result[lane] = static_cast<T>(leading_zeros);
    }
}
// Emits the VectorCountLeadingZeros8 operation.
// There is no dedicated code path here: the work is handed to EmitOneArgumentFallback
// together with the u8 instantiation of the scalar EmitVectorCountLeadingZeros helper.
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
}
// Emits the VectorCountLeadingZeros16 operation.
// There is no dedicated code path here: the work is handed to EmitOneArgumentFallback
// together with the u16 instantiation of the scalar EmitVectorCountLeadingZeros helper.
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
}
// Emits the VectorCountLeadingZeros32 operation.
// When the host reports both AVX-512CD and AVX-512VL, a single vplzcntd is emitted;
// otherwise the u32 instantiation of the scalar helper is used via EmitOneArgumentFallback.
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
    const bool has_vplzcntd = code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD)
                           && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL);
    if (!has_vplzcntd) {
        EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[0]);
    // The same register serves as source and destination of the count.
    code.vplzcntd(operand, operand);
    ctx.reg_alloc.DefineValue(inst, operand);
}
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);

View file

@ -916,6 +916,19 @@ U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) {
return {};
}
// Creates the count-leading-zeros vector instruction whose opcode matches `esize`.
// Only 8, 16 and 32 are handled; any other value reaches UNREACHABLE().
U128 IREmitter::VectorCountLeadingZeros(size_t esize, const U128& a) {
    if (esize == 8) {
        return Inst<U128>(Opcode::VectorCountLeadingZeros8, a);
    }
    if (esize == 16) {
        return Inst<U128>(Opcode::VectorCountLeadingZeros16, a);
    }
    if (esize == 32) {
        return Inst<U128>(Opcode::VectorCountLeadingZeros32, a);
    }

    UNREACHABLE();
    return {};
}
U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) {
switch (esize) {
case 8:

View file

@ -209,6 +209,7 @@ public:
U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount);
U128 VectorBroadcast(size_t esize, const UAny& a);
U128 VectorBroadcastLower(size_t esize, const UAny& a);
// Vector count-leading-zeros; esize selects the opcode variant (8, 16 or 32).
U128 VectorCountLeadingZeros(size_t esize, const U128& a);
U128 VectorEor(const U128& a, const U128& b);
U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b);

View file

@ -258,6 +258,9 @@ OPCODE(VectorBroadcast8, U128, U8
OPCODE(VectorBroadcast16, U128, U16 )
OPCODE(VectorBroadcast32, U128, U32 )
OPCODE(VectorBroadcast64, U128, U64 )
// Vector count-leading-zeros, one opcode per element size (8/16/32).
// NOTE(review): fields after the name are presumably result type then operand type(s) - confirm against the OPCODE macro.
OPCODE(VectorCountLeadingZeros8, U128, U128 )
OPCODE(VectorCountLeadingZeros16, U128, U128 )
OPCODE(VectorCountLeadingZeros32, U128, U128 )
OPCODE(VectorDeinterleaveEven8, U128, U128, U128 )
OPCODE(VectorDeinterleaveEven16, U128, U128, U128 )
OPCODE(VectorDeinterleaveEven32, U128, U128, U128 )