parallel: Add and Subtract with Exchange improvements
* Remove asx argument from PackedHalvingSubAdd{U16,S16} IR instructions
* Implement Packed{Halving,}{AddSub,SubAdd}{U16,S16} IR instructions
* Implement SASX, SSAX, UASX, USAX
This commit is contained in:
parent
fd068ed6b8
commit
05e97058c3
5 changed files with 157 additions and 41 deletions
|
@ -1986,17 +1986,15 @@ void EmitX64::EmitPackedHalvingSubS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
|||
reg_alloc.DefineValue(inst, minuend);
|
||||
}
|
||||
|
||||
void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, bool is_signed) {
|
||||
void EmitPackedSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
|
||||
auto args = reg_alloc.GetArgumentInfo(inst);
|
||||
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
||||
|
||||
Xbyak::Reg32 reg_a_hi = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||
Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(args[1]).cvt32();
|
||||
Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32();
|
||||
Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32();
|
||||
|
||||
// If asx is true, the high word contains the sum and the low word the difference.
|
||||
// If false, the high word contains the difference and the low word the sum.
|
||||
bool asx = args[2].GetImmediateU1();
|
||||
Xbyak::Reg32 reg_sum, reg_diff;
|
||||
|
||||
if (is_signed) {
|
||||
code->movsx(reg_a_lo, reg_a_hi.cvt16());
|
||||
|
@ -2010,22 +2008,48 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i
|
|||
code->shr(reg_b_hi, 16);
|
||||
}
|
||||
|
||||
if (asx) {
|
||||
// Calculate diff such that reg_a_lo<31:16> contains diff<16:1>.
|
||||
if (hi_is_sum) {
|
||||
code->sub(reg_a_lo, reg_b_hi);
|
||||
code->shl(reg_a_lo, 15);
|
||||
|
||||
// Calculate sum such that reg_a_hi<15:0> contains sum<16:1>.
|
||||
code->add(reg_a_hi, reg_b_lo);
|
||||
reg_diff = reg_a_lo;
|
||||
reg_sum = reg_a_hi;
|
||||
} else {
|
||||
code->add(reg_a_lo, reg_b_hi);
|
||||
code->sub(reg_a_hi, reg_b_lo);
|
||||
reg_diff = reg_a_hi;
|
||||
reg_sum = reg_a_lo;
|
||||
}
|
||||
|
||||
if (ge_inst) {
|
||||
EraseInstruction(block, ge_inst);
|
||||
|
||||
// The reg_b registers are no longer required.
|
||||
Xbyak::Reg32 ge_sum = reg_b_hi;
|
||||
Xbyak::Reg32 ge_diff = reg_b_lo;
|
||||
|
||||
code->mov(ge_sum, reg_sum);
|
||||
code->mov(ge_diff, reg_diff);
|
||||
|
||||
if (!is_signed) {
|
||||
code->shl(ge_sum, 15);
|
||||
code->sar(ge_sum, 16);
|
||||
} else {
|
||||
code->not(ge_sum);
|
||||
}
|
||||
code->not(ge_diff);
|
||||
code->and(ge_sum, hi_is_sum ? 0xC0000000 : 0x30000000);
|
||||
code->and(ge_diff, hi_is_sum ? 0x30000000 : 0xC0000000);
|
||||
code->or_(ge_sum, ge_diff);
|
||||
code->shr(ge_sum, 28);
|
||||
|
||||
reg_alloc.DefineValue(ge_inst, ge_sum);
|
||||
}
|
||||
|
||||
if (is_halving) {
|
||||
code->shl(reg_a_lo, 15);
|
||||
code->shr(reg_a_hi, 1);
|
||||
} else {
|
||||
// Calculate sum such that reg_a_lo<31:16> contains sum<16:1>.
|
||||
code->add(reg_a_lo, reg_b_hi);
|
||||
code->shl(reg_a_lo, 15);
|
||||
|
||||
// Calculate diff such that reg_a_hi<15:0> contains diff<16:1>.
|
||||
code->sub(reg_a_hi, reg_b_lo);
|
||||
code->shr(reg_a_hi, 1);
|
||||
code->shl(reg_a_lo, 16);
|
||||
}
|
||||
|
||||
// reg_a_lo now contains the low word and reg_a_hi now contains the high word.
|
||||
|
@ -2035,12 +2059,36 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i
|
|||
reg_alloc.DefineValue(inst, reg_a_hi);
|
||||
}
|
||||
|
||||
void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||
EmitPackedHalvingSubAdd(code, reg_alloc, inst, false);
|
||||
// Emits host code for the PackedAddSubU16 IR instruction by forwarding to the
// shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = true;
    const bool is_signed = false;
    const bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||
EmitPackedHalvingSubAdd(code, reg_alloc, inst, true);
|
||||
// Emits host code for the PackedAddSubS16 IR instruction by forwarding to the
// shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = true;
    const bool is_signed = true;
    const bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedSubAddU16 IR instruction by forwarding to the
// shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = false;
    const bool is_signed = false;
    const bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedSubAddS16 IR instruction by forwarding to the
// shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = false;
    const bool is_signed = true;
    const bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedHalvingAddSubU16 IR instruction by forwarding to
// the shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedHalvingAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = true;
    const bool is_signed = false;
    const bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedHalvingAddSubS16 IR instruction by forwarding to
// the shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedHalvingAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = true;
    const bool is_signed = true;
    const bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedHalvingSubAddU16 IR instruction by forwarding to
// the shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = false;
    const bool is_signed = false;
    const bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
// Emits host code for the PackedHalvingSubAddS16 IR instruction by forwarding to
// the shared EmitPackedSubAdd helper with the flag combination for this variant.
void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    const bool hi_is_sum = false;
    const bool is_signed = true;
    const bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
|
||||
|
||||
static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
|
||||
|
|
|
@ -402,6 +402,30 @@ IREmitter::ResultAndGE IREmitter::PackedSubS16(const Value& a, const Value& b) {
|
|||
return {result, ge};
|
||||
}
|
||||
|
||||
// Appends a PackedAddSubU16 instruction on (a, b), followed by a GetGEFromOp
// pseudo-operation attached to it, and returns both values as a pair.
IREmitter::ResultAndGE IREmitter::PackedAddSubU16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedAddSubU16, {a, b});
    // The GE value is derived from the packed instruction emitted just above.
    return {packed, Inst(Opcode::GetGEFromOp, {packed})};
}
|
||||
|
||||
// Appends a PackedAddSubS16 instruction on (a, b), followed by a GetGEFromOp
// pseudo-operation attached to it, and returns both values as a pair.
IREmitter::ResultAndGE IREmitter::PackedAddSubS16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedAddSubS16, {a, b});
    // The GE value is derived from the packed instruction emitted just above.
    return {packed, Inst(Opcode::GetGEFromOp, {packed})};
}
|
||||
|
||||
// Appends a PackedSubAddU16 instruction on (a, b), followed by a GetGEFromOp
// pseudo-operation attached to it, and returns both values as a pair.
IREmitter::ResultAndGE IREmitter::PackedSubAddU16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedSubAddU16, {a, b});
    // The GE value is derived from the packed instruction emitted just above.
    return {packed, Inst(Opcode::GetGEFromOp, {packed})};
}
|
||||
|
||||
// Appends a PackedSubAddS16 instruction on (a, b), followed by a GetGEFromOp
// pseudo-operation attached to it, and returns both values as a pair.
IREmitter::ResultAndGE IREmitter::PackedSubAddS16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedSubAddS16, {a, b});
    // The GE value is derived from the packed instruction emitted just above.
    return {packed, Inst(Opcode::GetGEFromOp, {packed})};
}
|
||||
|
||||
// Appends a PackedHalvingAddU8 instruction taking (a, b) and returns its value.
Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddU8, {a, b});
    return halved;
}
|
||||
|
@ -434,12 +458,20 @@ Value IREmitter::PackedHalvingSubS16(const Value& a, const Value& b) {
|
|||
return Inst(Opcode::PackedHalvingSubS16, {a, b});
|
||||
}
|
||||
|
||||
Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx) {
|
||||
return Inst(Opcode::PackedHalvingSubAddU16, {a, b, Imm1(asx)});
|
||||
// Appends a PackedHalvingAddSubU16 instruction taking (a, b) and returns its value.
Value IREmitter::PackedHalvingAddSubU16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddSubU16, {a, b});
    return halved;
}
|
||||
|
||||
Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx) {
|
||||
return Inst(Opcode::PackedHalvingSubAddS16, {a, b, Imm1(asx)});
|
||||
// Appends a PackedHalvingAddSubS16 instruction taking (a, b) and returns its value.
Value IREmitter::PackedHalvingAddSubS16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddSubS16, {a, b});
    return halved;
}
|
||||
|
||||
// Appends a PackedHalvingSubAddU16 instruction taking (a, b) and returns its value.
Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingSubAddU16, {a, b});
    return halved;
}
|
||||
|
||||
// Appends a PackedHalvingSubAddS16 instruction taking (a, b) and returns its value.
Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingSubAddS16, {a, b});
    return halved;
}
|
||||
|
||||
Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) {
|
||||
|
|
|
@ -149,6 +149,10 @@ public:
|
|||
ResultAndGE PackedSubS8(const Value& a, const Value& b);
|
||||
ResultAndGE PackedSubU16(const Value& a, const Value& b);
|
||||
ResultAndGE PackedSubS16(const Value& a, const Value& b);
|
||||
ResultAndGE PackedAddSubU16(const Value& a, const Value& b);
|
||||
ResultAndGE PackedAddSubS16(const Value& a, const Value& b);
|
||||
ResultAndGE PackedSubAddU16(const Value& a, const Value& b);
|
||||
ResultAndGE PackedSubAddS16(const Value& a, const Value& b);
|
||||
Value PackedHalvingAddU8(const Value& a, const Value& b);
|
||||
Value PackedHalvingAddS8(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubU8(const Value& a, const Value& b);
|
||||
|
@ -157,8 +161,10 @@ public:
|
|||
Value PackedHalvingAddS16(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubU16(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubS16(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx);
|
||||
Value PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx);
|
||||
Value PackedHalvingAddSubU16(const Value& a, const Value& b);
|
||||
Value PackedHalvingAddSubS16(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubAddU16(const Value& a, const Value& b);
|
||||
Value PackedHalvingSubAddS16(const Value& a, const Value& b);
|
||||
Value PackedSaturatedAddU8(const Value& a, const Value& b);
|
||||
Value PackedSaturatedAddS8(const Value& a, const Value& b);
|
||||
Value PackedSaturatedSubU8(const Value& a, const Value& b);
|
||||
|
|
|
@ -90,6 +90,10 @@ OPCODE(PackedAddU16, T::U32, T::U32, T::U32
|
|||
OPCODE(PackedAddS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSubU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSubS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 )
|
||||
|
@ -98,8 +102,10 @@ OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32
|
|||
OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32, T::U1 )
|
||||
OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32, T::U1 )
|
||||
OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 )
|
||||
|
|
|
@ -33,13 +33,25 @@ bool ArmTranslatorVisitor::arm_SADD16(Cond cond, Reg n, Reg d, Reg m) {
|
|||
}
|
||||
|
||||
// Translates the ARM SASX instruction into IR.
//
// Any of d, n or m being the PC is reported through UnpredictableInstruction().
// Otherwise, when the condition passes, a PackedAddSubS16 operation on Rn and Rm
// is emitted; its result is written to Rd and its GE value to the GE flags.
// The former interpreter-fallback stub (UNUSED(...) followed by an unconditional
// return) is removed, since it made everything below it unreachable.
bool ArmTranslatorVisitor::arm_SASX(Cond cond, Reg n, Reg d, Reg m) {
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
|
||||
|
||||
// Translates the ARM SSAX instruction into IR.
//
// Any of d, n or m being the PC is reported through UnpredictableInstruction().
// Otherwise, when the condition passes, a PackedSubAddS16 operation on Rn and Rm
// is emitted; its result is written to Rd and its GE value to the GE flags.
// The former interpreter-fallback stub (UNUSED(...) followed by an unconditional
// return) is removed, since it made everything below it unreachable.
bool ArmTranslatorVisitor::arm_SSAX(Cond cond, Reg n, Reg d, Reg m) {
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
|
||||
|
||||
bool ArmTranslatorVisitor::arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) {
|
||||
|
@ -87,13 +99,25 @@ bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
|
|||
}
|
||||
|
||||
// Translates the ARM UASX instruction into IR.
//
// Any of d, n or m being the PC is reported through UnpredictableInstruction().
// Otherwise, when the condition passes, a PackedAddSubU16 operation on Rn and Rm
// is emitted; its result is written to Rd and its GE value to the GE flags.
// The former interpreter-fallback stub (UNUSED(...) followed by an unconditional
// return) is removed, since it made everything below it unreachable.
bool ArmTranslatorVisitor::arm_UASX(Cond cond, Reg n, Reg d, Reg m) {
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
|
||||
|
||||
// Translates the ARM USAX instruction into IR.
//
// Any of d, n or m being the PC is reported through UnpredictableInstruction().
// Otherwise, when the condition passes, a PackedSubAddU16 operation on Rn and Rm
// is emitted; its result is written to Rd and its GE value to the GE flags.
// The former interpreter-fallback stub (UNUSED(...) followed by an unconditional
// return) is removed, since it made everything below it unreachable.
bool ArmTranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) {
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
|
||||
|
||||
bool ArmTranslatorVisitor::arm_USAD8(Cond cond, Reg d, Reg m, Reg n) {
|
||||
|
@ -261,7 +285,7 @@ bool ArmTranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) {
|
|||
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
|
||||
return UnpredictableInstruction();
|
||||
if (ConditionPassed(cond)) {
|
||||
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), true);
|
||||
auto result = ir.PackedHalvingAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
|
||||
ir.SetRegister(d, result);
|
||||
}
|
||||
return true;
|
||||
|
@ -271,7 +295,7 @@ bool ArmTranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) {
|
|||
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
|
||||
return UnpredictableInstruction();
|
||||
if (ConditionPassed(cond)) {
|
||||
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), false);
|
||||
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
|
||||
ir.SetRegister(d, result);
|
||||
}
|
||||
return true;
|
||||
|
@ -321,7 +345,7 @@ bool ArmTranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) {
|
|||
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
|
||||
return UnpredictableInstruction();
|
||||
if (ConditionPassed(cond)) {
|
||||
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), true);
|
||||
auto result = ir.PackedHalvingAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
|
||||
ir.SetRegister(d, result);
|
||||
}
|
||||
return true;
|
||||
|
@ -331,7 +355,7 @@ bool ArmTranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) {
|
|||
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
|
||||
return UnpredictableInstruction();
|
||||
if (ConditionPassed(cond)) {
|
||||
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), false);
|
||||
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
|
||||
ir.SetRegister(d, result);
|
||||
}
|
||||
return true;
|
||||
|
|
Loading…
Reference in a new issue