Implement IR instruction PackedSelect, reimplement SEL
This commit is contained in:
parent
18f11972c6
commit
d1e0a29cd9
5 changed files with 25 additions and 14 deletions
|
@ -2128,6 +2128,24 @@ void EmitX64::EmitPackedAbsDiffSumS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
|||
EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::psadbw);
|
||||
}
|
||||
|
||||
/// Emits host code for the PackedSelect IR instruction.
///
/// Operands (in IR order):
///   args[0] - GE flags; only the low four bits are consumed, one per byte lane.
///   args[1] - value supplying the byte lanes whose GE bit is clear ("to").
///   args[2] - value supplying the byte lanes whose GE bit is set ("from").
///
/// The result is (from & mask) | (to & ~mask), where mask has 0xFF in every
/// byte lane whose GE bit is 1 and 0x00 elsewhere.
///
/// Only baseline x86-64 instructions are emitted. The earlier sequence used
/// PDEP (BMI2) and ANDN (BMI1) unconditionally, which faults with #UD on hosts
/// lacking those extensions. A feature-gated fast path can be reinstated on
/// top of this if profiling ever justifies it.
void EmitX64::EmitPackedSelect(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
    auto args = reg_alloc.GetArgumentInfo(inst);

    // All three operands are overwritten below, so each needs a scratch copy.
    Xbyak::Reg32 ge = reg_alloc.UseScratchGpr(args[0]).cvt32();
    Xbyak::Reg32 to = reg_alloc.UseScratchGpr(args[1]).cvt32();
    Xbyak::Reg32 from = reg_alloc.UseScratchGpr(args[2]).cvt32();

    // Expand 0bXYZW into 0xXXYYZZWW (each flag bit replicated across a byte):
    //   0000 0000 0000 0000 | 0000 0000 0000 xyzw   (ge & 0xF)
    //   0000 000x yzw0 00xy | zw00 0xyz w000 xyzw   (* 0x00204081)
    //   0000 000x 0000 000y | 0000 000z 0000 000w   (& 0x01010101)
    //   xxxx xxxx yyyy yyyy | zzzz zzzz wwww wwww   (* 0xFF)
    // PDEP with mask 0x01010101 reads only the low four source bits; masking
    // first keeps that behaviour even if higher bits of ge are ever set.
    code->and_(ge, 0xF);
    code->imul(ge, ge, 0x00204081);
    code->and_(ge, 0x01010101);
    code->imul(ge, ge, 0xFF);

    // from = (from & mask) | (to & ~mask)
    code->and_(from, ge);
    code->not_(ge);
    code->and_(to, ge);
    code->or_(from, to);

    reg_alloc.DefineValue(inst, from);
}
|
||||
|
||||
static void DenormalsAreZero32(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
||||
using namespace Xbyak::util;
|
||||
Xbyak::Label end;
|
||||
|
|
|
@ -510,6 +510,10 @@ Value IREmitter::PackedAbsDiffSumS8(const Value& a, const Value& b) {
|
|||
return Inst(Opcode::PackedAbsDiffSumS8, {a, b});
|
||||
}
|
||||
|
||||
/// Appends a PackedSelect instruction to the block being built.
///
/// @param ge Flag bits choosing, per byte lane, which operand supplies the lane.
/// @param a  Second IR operand; the x64 backend takes from it the lanes whose flag bit is clear.
/// @param b  Third IR operand; the x64 backend takes from it the lanes whose flag bit is set.
/// @return   The value produced by the new instruction.
Value IREmitter::PackedSelect(const Value& ge, const Value& a, const Value& b) {
    const Value selected = Inst(Opcode::PackedSelect, {ge, a, b});
    return selected;
}
|
||||
|
||||
/// Appends a TransferToFP32 instruction taking `a` as its only operand.
/// The opcode table declares a U32 operand and an F32 result.
/// NOTE(review): presumably a raw bit transfer rather than a numeric conversion - confirm.
Value IREmitter::TransferToFP32(const Value& a) {
    const Value transferred = Inst(Opcode::TransferToFP32, {a});
    return transferred;
}
|
||||
|
|
|
@ -174,6 +174,7 @@ public:
|
|||
Value PackedSaturatedSubU16(const Value& a, const Value& b);
|
||||
Value PackedSaturatedSubS16(const Value& a, const Value& b);
|
||||
Value PackedAbsDiffSumS8(const Value& a, const Value& b);
|
||||
/// Emits Opcode::PackedSelect with operands {ge, a, b}, in that order.
/// In the x64 backend, byte lanes whose `ge` bit is set come from `b`; the rest come from `a`.
Value PackedSelect(const Value& ge, const Value& a, const Value& b);
|
||||
|
||||
Value TransferToFP32(const Value& a);
|
||||
Value TransferToFP64(const Value& a);
|
||||
|
|
|
@ -115,6 +115,7 @@ OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32
|
|||
OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 )
|
||||
// PackedSelect operands as passed by IREmitter::PackedSelect: ge, a, b.
// NOTE(review): the first type column appears to be the result type - confirm against the OPCODE macro.
OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 )
|
||||
|
||||
// Floating-point operations
|
||||
OPCODE(TransferToFP32, T::F32, T::U32 )
|
||||
|
|
|
@ -23,22 +23,9 @@ bool ArmTranslatorVisitor::arm_SEL(Cond cond, Reg n, Reg d, Reg m) {
|
|||
return UnpredictableInstruction();
|
||||
|
||||
if (ConditionPassed(cond)) {
|
||||
auto ge = ir.GetGEFlags();
|
||||
|
||||
// Perform some arithmetic to expand 0bXYZW into 0bXXXXXXXXYYYYYYYYZZZZZZZZWWWWWWWW => 0xXXYYZZWW
|
||||
// The logic behind this is as follows:
|
||||
// 0000 0000 0000 0000 | 0000 0000 0000 xyzw
|
||||
// 0000 000x yzw0 00xy | zw00 0xyz w000 xyzw (x * 0x00204081)
|
||||
// 0000 000x 0000 000y | 0000 000z 0000 000w (x & 0x01010101)
|
||||
// xxxx xxxx yyyy yyyy | zzzz zzzz wwww wwww (x * 0xff)
|
||||
|
||||
auto x2 = ir.Mul(ge, ir.Imm32(0x00204081));
|
||||
auto x3 = ir.And(x2, ir.Imm32(0x01010101));
|
||||
auto mask = ir.Mul(x3, ir.Imm32(0xFF));
|
||||
|
||||
auto to = ir.GetRegister(m);
|
||||
auto from = ir.GetRegister(n);
|
||||
auto result = ir.Or(ir.And(from, mask), ir.And(to, ir.Not(mask)));
|
||||
auto result = ir.PackedSelect(ir.GetGEFlags(), to, from);
|
||||
ir.SetRegister(d, result);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue