Implement IR instruction PackedSelect, reimplement SEL
This commit is contained in:
parent
18f11972c6
commit
d1e0a29cd9
5 changed files with 25 additions and 14 deletions
|
@ -2128,6 +2128,24 @@ void EmitX64::EmitPackedAbsDiffSumS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
||||||
EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::psadbw);
|
EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::psadbw);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits x64 code for the PackedSelect IR instruction.
// args[0] is a bit mask (`ge`); args[1] (`to`) and args[2] (`from`) are the
// two 32-bit operands. Each of the low four bits of `ge` is expanded into a
// full byte, and the result is (from & mask) | (to & ~mask): a byte comes
// from `from` where its mask bit is set and from `to` otherwise.
// NOTE(review): pdep is a BMI2 instruction and andn is BMI1; no CPU-feature
// check is visible in this function — confirm the host is guaranteed to
// support both.
void EmitX64::EmitPackedSelect(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||

auto args = reg_alloc.GetArgumentInfo(inst);
|
||||||

|
||||||

// All three inputs are overwritten below, hence UseScratchGpr rather than a read-only use.
Xbyak::Reg32 ge = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||||

Xbyak::Reg32 to = reg_alloc.UseScratchGpr(args[1]).cvt32();
|
||||||

Xbyak::Reg32 from = reg_alloc.UseScratchGpr(args[2]).cvt32();
|
||||||

Xbyak::Reg32 tmp = reg_alloc.ScratchGpr().cvt32();
|
||||||

|
||||||

// tmp holds the deposit mask: bit 0 of every byte.
code->mov(tmp, 0x01010101);
|
||||||

// Scatter the low four bits of ge to bit 0 of each byte: 0bXYZW -> 0x0X0Y0Z0W.
code->pdep(ge, ge, tmp);
|
||||||

// Multiplying by 0xFF turns each 0x01 byte into 0xFF and leaves 0x00 bytes as 0x00.
code->imul(ge, ge, 0xFF);
|
||||||

// from = from & mask (bytes taken from `from`).
code->and_(from, ge);
|
||||||

// to = ~mask & to (bytes taken from `to`).
code->andn(to, ge, to);
|
||||||

// Combine the two disjoint byte sets.
code->or_(from, to);
|
||||||

|
||||||

reg_alloc.DefineValue(inst, from);
|
||||||

}
|
||||||
|
|
||||||
static void DenormalsAreZero32(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
static void DenormalsAreZero32(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
||||||
using namespace Xbyak::util;
|
using namespace Xbyak::util;
|
||||||
Xbyak::Label end;
|
Xbyak::Label end;
|
||||||
|
|
|
@ -510,6 +510,10 @@ Value IREmitter::PackedAbsDiffSumS8(const Value& a, const Value& b) {
|
||||||
return Inst(Opcode::PackedAbsDiffSumS8, {a, b});
|
return Inst(Opcode::PackedAbsDiffSumS8, {a, b});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Appends a PackedSelect IR instruction with operands (ge, a, b), in that
// order, and returns the Value produced by that instruction.
// NOTE(review): presumably `ge` selects per byte between `a` and `b`; the
// exact selection rule is defined by the backend emitter — confirm there.
Value IREmitter::PackedSelect(const Value& ge, const Value& a, const Value& b) {
|
||||||

return Inst(Opcode::PackedSelect, {ge, a, b});
|
||||||

}
|
||||||
|
|
||||||
Value IREmitter::TransferToFP32(const Value& a) {
|
Value IREmitter::TransferToFP32(const Value& a) {
|
||||||
return Inst(Opcode::TransferToFP32, {a});
|
return Inst(Opcode::TransferToFP32, {a});
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,6 +174,7 @@ public:
|
||||||
Value PackedSaturatedSubU16(const Value& a, const Value& b);
|
Value PackedSaturatedSubU16(const Value& a, const Value& b);
|
||||||
Value PackedSaturatedSubS16(const Value& a, const Value& b);
|
Value PackedSaturatedSubS16(const Value& a, const Value& b);
|
||||||
Value PackedAbsDiffSumS8(const Value& a, const Value& b);
|
Value PackedAbsDiffSumS8(const Value& a, const Value& b);
|
||||||
|
Value PackedSelect(const Value& ge, const Value& a, const Value& b);
|
||||||
|
|
||||||
Value TransferToFP32(const Value& a);
|
Value TransferToFP32(const Value& a);
|
||||||
Value TransferToFP64(const Value& a);
|
Value TransferToFP64(const Value& a);
|
||||||
|
|
|
@ -115,6 +115,7 @@ OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32
|
||||||
OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 )
|
OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 )
|
OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 )
|
OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 )
|
||||||
|
OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 )
|
||||||
|
|
||||||
// Floating-point operations
|
// Floating-point operations
|
||||||
OPCODE(TransferToFP32, T::F32, T::U32 )
|
OPCODE(TransferToFP32, T::F32, T::U32 )
|
||||||
|
|
|
@ -23,22 +23,9 @@ bool ArmTranslatorVisitor::arm_SEL(Cond cond, Reg n, Reg d, Reg m) {
|
||||||
return UnpredictableInstruction();
|
return UnpredictableInstruction();
|
||||||
|
|
||||||
if (ConditionPassed(cond)) {
|
if (ConditionPassed(cond)) {
|
||||||
auto ge = ir.GetGEFlags();
|
|
||||||
|
|
||||||
// Perform some arithmetic to expand 0bXYZW into 0bXXXXXXXXYYYYYYYYZZZZZZZZWWWWWWWW => 0xXXYYZZWW
|
|
||||||
// The logic behind this is as follows:
|
|
||||||
// 0000 0000 0000 0000 | 0000 0000 0000 xyzw
|
|
||||||
// 0000 000x yzw0 00xy | zw00 0xyz w000 xyzw (x * 0x00204081)
|
|
||||||
// 0000 000x 0000 000y | 0000 000z 0000 000w (x & 0x01010101)
|
|
||||||
// xxxx xxxx yyyy yyyy | zzzz zzzz wwww wwww (x * 0xff)
|
|
||||||
|
|
||||||
auto x2 = ir.Mul(ge, ir.Imm32(0x00204081));
|
|
||||||
auto x3 = ir.And(x2, ir.Imm32(0x01010101));
|
|
||||||
auto mask = ir.Mul(x3, ir.Imm32(0xFF));
|
|
||||||
|
|
||||||
auto to = ir.GetRegister(m);
|
auto to = ir.GetRegister(m);
|
||||||
auto from = ir.GetRegister(n);
|
auto from = ir.GetRegister(n);
|
||||||
auto result = ir.Or(ir.And(from, mask), ir.And(to, ir.Not(mask)));
|
auto result = ir.PackedSelect(ir.GetGEFlags(), to, from);
|
||||||
ir.SetRegister(d, result);
|
ir.SetRegister(d, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue