parallel: Add and Subtract with Exchange improvements

* Remove asx argument from PackedHalvingSubAdd{U16,S16} IR instruction
* Implement Packed{Halving,}{AddSub,SubAdd}{U16,S16} IR instructions
* Implement SASX, SSAX, UASX, USAX
This commit is contained in:
MerryMage 2017-03-24 15:56:24 +00:00
parent fd068ed6b8
commit 05e97058c3
5 changed files with 157 additions and 41 deletions

View file

@ -1986,17 +1986,15 @@ void EmitX64::EmitPackedHalvingSubS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
reg_alloc.DefineValue(inst, minuend);
}
void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, bool is_signed) {
void EmitPackedSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
auto args = reg_alloc.GetArgumentInfo(inst);
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
Xbyak::Reg32 reg_a_hi = reg_alloc.UseScratchGpr(args[0]).cvt32();
Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(args[1]).cvt32();
Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32();
Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32();
// If asx is true, the high word contains the sum and the low word the difference.
// If false, the high word contains the difference and the low word the sum.
bool asx = args[2].GetImmediateU1();
Xbyak::Reg32 reg_sum, reg_diff;
if (is_signed) {
code->movsx(reg_a_lo, reg_a_hi.cvt16());
@ -2010,22 +2008,48 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i
code->shr(reg_b_hi, 16);
}
if (asx) {
// Calculate diff such that reg_a_lo<31:16> contains diff<16:1>.
if (hi_is_sum) {
code->sub(reg_a_lo, reg_b_hi);
code->shl(reg_a_lo, 15);
// Calculate sum such that reg_a_hi<15:0> contains sum<16:1>.
code->add(reg_a_hi, reg_b_lo);
reg_diff = reg_a_lo;
reg_sum = reg_a_hi;
} else {
code->add(reg_a_lo, reg_b_hi);
code->sub(reg_a_hi, reg_b_lo);
reg_diff = reg_a_hi;
reg_sum = reg_a_lo;
}
if (ge_inst) {
EraseInstruction(block, ge_inst);
// The reg_b registers are no longer required.
Xbyak::Reg32 ge_sum = reg_b_hi;
Xbyak::Reg32 ge_diff = reg_b_lo;
code->mov(ge_sum, reg_sum);
code->mov(ge_diff, reg_diff);
if (!is_signed) {
code->shl(ge_sum, 15);
code->sar(ge_sum, 16);
} else {
code->not(ge_sum);
}
code->not(ge_diff);
code->and(ge_sum, hi_is_sum ? 0xC0000000 : 0x30000000);
code->and(ge_diff, hi_is_sum ? 0x30000000 : 0xC0000000);
code->or_(ge_sum, ge_diff);
code->shr(ge_sum, 28);
reg_alloc.DefineValue(ge_inst, ge_sum);
}
if (is_halving) {
code->shl(reg_a_lo, 15);
code->shr(reg_a_hi, 1);
} else {
// Calculate sum such that reg_a_lo<31:16> contains sum<16:1>.
code->add(reg_a_lo, reg_b_hi);
code->shl(reg_a_lo, 15);
// Calculate diff such that reg_a_hi<15:0> contains diff<16:1>.
code->sub(reg_a_hi, reg_b_lo);
code->shr(reg_a_hi, 1);
code->shl(reg_a_lo, 16);
}
// reg_a_lo now contains the low word and reg_a_hi now contains the high word.
@ -2035,12 +2059,36 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i
reg_alloc.DefineValue(inst, reg_a_hi);
}
void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
EmitPackedHalvingSubAdd(code, reg_alloc, inst, false);
// Unsigned, non-halving add/subtract with exchange.
// The high word of the result holds the sum and the low word the difference.
void EmitX64::EmitPackedAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
EmitPackedHalvingSubAdd(code, reg_alloc, inst, true);
// Signed, non-halving add/subtract with exchange.
// The high word of the result holds the sum and the low word the difference.
void EmitX64::EmitPackedAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned, non-halving subtract/add with exchange.
// The high word of the result holds the difference and the low word the sum.
void EmitX64::EmitPackedSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Signed, non-halving subtract/add with exchange.
// The high word of the result holds the difference and the low word the sum.
void EmitX64::EmitPackedSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned, halving add/subtract with exchange.
// The high word of the result holds the sum and the low word the difference.
void EmitX64::EmitPackedHalvingAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Signed, halving add/subtract with exchange.
// The high word of the result holds the sum and the low word the difference.
void EmitX64::EmitPackedHalvingAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned, halving subtract/add with exchange.
// The high word of the result holds the difference and the low word the sum.
void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
// Signed, halving subtract/add with exchange.
// The high word of the result holds the difference and the low word the sum.
void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, reg_alloc, block, inst, hi_is_sum, is_signed, is_halving);
}
static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {

View file

@ -402,6 +402,30 @@ IREmitter::ResultAndGE IREmitter::PackedSubS16(const Value& a, const Value& b) {
return {result, ge};
}
// Emits a PackedAddSubU16 instruction on a and b, followed by a GetGEFromOp
// pseudo-operation attached to it. Returns both values as a {result, ge} pair.
IREmitter::ResultAndGE IREmitter::PackedAddSubU16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedAddSubU16, {a, b});
    auto ge_flags = Inst(Opcode::GetGEFromOp, {packed});
    return {packed, ge_flags};
}
// Emits a PackedAddSubS16 instruction on a and b, followed by a GetGEFromOp
// pseudo-operation attached to it. Returns both values as a {result, ge} pair.
IREmitter::ResultAndGE IREmitter::PackedAddSubS16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedAddSubS16, {a, b});
    auto ge_flags = Inst(Opcode::GetGEFromOp, {packed});
    return {packed, ge_flags};
}
// Emits a PackedSubAddU16 instruction on a and b, followed by a GetGEFromOp
// pseudo-operation attached to it. Returns both values as a {result, ge} pair.
IREmitter::ResultAndGE IREmitter::PackedSubAddU16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedSubAddU16, {a, b});
    auto ge_flags = Inst(Opcode::GetGEFromOp, {packed});
    return {packed, ge_flags};
}
// Emits a PackedSubAddS16 instruction on a and b, followed by a GetGEFromOp
// pseudo-operation attached to it. Returns both values as a {result, ge} pair.
IREmitter::ResultAndGE IREmitter::PackedSubAddS16(const Value& a, const Value& b) {
    auto packed = Inst(Opcode::PackedSubAddS16, {a, b});
    auto ge_flags = Inst(Opcode::GetGEFromOp, {packed});
    return {packed, ge_flags};
}
// Emits a PackedHalvingAddU8 instruction on a and b and returns its value.
Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddU8, {a, b});
    return halved;
}
@ -434,12 +458,20 @@ Value IREmitter::PackedHalvingSubS16(const Value& a, const Value& b) {
return Inst(Opcode::PackedHalvingSubS16, {a, b});
}
Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx) {
return Inst(Opcode::PackedHalvingSubAddU16, {a, b, Imm1(asx)});
// Emits a PackedHalvingAddSubU16 instruction on a and b and returns its value.
// This is the IR operation the UHASX translation uses.
Value IREmitter::PackedHalvingAddSubU16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddSubU16, {a, b});
    return halved;
}
Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx) {
return Inst(Opcode::PackedHalvingSubAddS16, {a, b, Imm1(asx)});
// Emits a PackedHalvingAddSubS16 instruction on a and b and returns its value.
// This is the IR operation the SHASX translation uses.
Value IREmitter::PackedHalvingAddSubS16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingAddSubS16, {a, b});
    return halved;
}
// Emits a PackedHalvingSubAddU16 instruction on a and b and returns its value.
// This is the IR operation the UHSAX translation uses.
Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingSubAddU16, {a, b});
    return halved;
}
// Emits a PackedHalvingSubAddS16 instruction on a and b and returns its value.
// This is the IR operation the SHSAX translation uses.
Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b) {
    auto halved = Inst(Opcode::PackedHalvingSubAddS16, {a, b});
    return halved;
}
Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) {

View file

@ -149,6 +149,10 @@ public:
ResultAndGE PackedSubS8(const Value& a, const Value& b);
ResultAndGE PackedSubU16(const Value& a, const Value& b);
ResultAndGE PackedSubS16(const Value& a, const Value& b);
// Add/subtract with exchange on packed 16-bit halves (unsigned and signed variants).
// Each returns the packed result together with the value of a GetGEFromOp
// pseudo-operation attached to that result.
ResultAndGE PackedAddSubU16(const Value& a, const Value& b);
ResultAndGE PackedAddSubS16(const Value& a, const Value& b);
ResultAndGE PackedSubAddU16(const Value& a, const Value& b);
ResultAndGE PackedSubAddS16(const Value& a, const Value& b);
Value PackedHalvingAddU8(const Value& a, const Value& b);
Value PackedHalvingAddS8(const Value& a, const Value& b);
Value PackedHalvingSubU8(const Value& a, const Value& b);
@ -157,8 +161,10 @@ public:
Value PackedHalvingAddS16(const Value& a, const Value& b);
Value PackedHalvingSubU16(const Value& a, const Value& b);
Value PackedHalvingSubS16(const Value& a, const Value& b);
Value PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx);
Value PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx);
// Halving add/subtract with exchange on packed 16-bit halves (unsigned and signed variants).
// The AddSub/SubAdd direction is selected by the function name rather than by a bool argument.
// These return only the packed result; no GE value is produced.
Value PackedHalvingAddSubU16(const Value& a, const Value& b);
Value PackedHalvingAddSubS16(const Value& a, const Value& b);
Value PackedHalvingSubAddU16(const Value& a, const Value& b);
Value PackedHalvingSubAddS16(const Value& a, const Value& b);
Value PackedSaturatedAddU8(const Value& a, const Value& b);
Value PackedSaturatedAddS8(const Value& a, const Value& b);
Value PackedSaturatedSubU8(const Value& a, const Value& b);

View file

@ -90,6 +90,10 @@ OPCODE(PackedAddU16, T::U32, T::U32, T::U32
OPCODE(PackedAddS16, T::U32, T::U32, T::U32 )
OPCODE(PackedSubU16, T::U32, T::U32, T::U32 )
OPCODE(PackedSubS16, T::U32, T::U32, T::U32 )
// Packed 16-bit add/subtract with exchange. Each takes two packed U32 operands.
// NOTE(review): presumably the first T::U32 column is the result type and the rest are operand types — confirm against the OPCODE macro.
OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 )
OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 )
OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 )
OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 )
@ -98,8 +102,10 @@ OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32
OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32, T::U1 )
OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32, T::U1 )
// Packed 16-bit halving add/subtract with exchange. The exchange direction is encoded in the
// opcode name (AddSub vs SubAdd), so these rows carry no T::U1 immediate operand.
OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 )
OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 )
OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 )
OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 )
OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 )

View file

@ -33,13 +33,25 @@ bool ArmTranslatorVisitor::arm_SADD16(Cond cond, Reg n, Reg d, Reg m) {
}
// SASX: translates the instruction to the PackedAddSubS16 IR operation.
// Using PC for any of d, n or m is treated as an unpredictable instruction.
// When the condition passes, the packed result is written to register d and the
// accompanying GE value is written to the GE flags.
// Returns true after emitting the translation; the unpredictable path returns
// whatever UnpredictableInstruction() reports.
bool ArmTranslatorVisitor::arm_SASX(Cond cond, Reg n, Reg d, Reg m) {
    // The stale interpreter fallback that preceded this code returned unconditionally,
    // which made the translation below unreachable; it has been removed.
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
// SSAX: translates the instruction to the PackedSubAddS16 IR operation.
// Using PC for any of d, n or m is treated as an unpredictable instruction.
// When the condition passes, the packed result is written to register d and the
// accompanying GE value is written to the GE flags.
// Returns true after emitting the translation; the unpredictable path returns
// whatever UnpredictableInstruction() reports.
bool ArmTranslatorVisitor::arm_SSAX(Cond cond, Reg n, Reg d, Reg m) {
    // The stale interpreter fallback that preceded this code returned unconditionally,
    // which made the translation below unreachable; it has been removed.
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
bool ArmTranslatorVisitor::arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) {
@ -87,13 +99,25 @@ bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
}
// UASX: translates the instruction to the PackedAddSubU16 IR operation.
// Using PC for any of d, n or m is treated as an unpredictable instruction.
// When the condition passes, the packed result is written to register d and the
// accompanying GE value is written to the GE flags.
// Returns true after emitting the translation; the unpredictable path returns
// whatever UnpredictableInstruction() reports.
bool ArmTranslatorVisitor::arm_UASX(Cond cond, Reg n, Reg d, Reg m) {
    // The stale interpreter fallback that preceded this code returned unconditionally,
    // which made the translation below unreachable; it has been removed.
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
// USAX: translates the instruction to the PackedSubAddU16 IR operation.
// Using PC for any of d, n or m is treated as an unpredictable instruction.
// When the condition passes, the packed result is written to register d and the
// accompanying GE value is written to the GE flags.
// Returns true after emitting the translation; the unpredictable path returns
// whatever UnpredictableInstruction() reports.
bool ArmTranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) {
    // The stale interpreter fallback that preceded this code returned unconditionally,
    // which made the translation below unreachable; it has been removed.
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result.result);
        ir.SetGEFlags(result.ge);
    }
    return true;
}
bool ArmTranslatorVisitor::arm_USAD8(Cond cond, Reg d, Reg m, Reg n) {
@ -261,7 +285,7 @@ bool ArmTranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) {
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
return UnpredictableInstruction();
if (ConditionPassed(cond)) {
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), true);
auto result = ir.PackedHalvingAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
ir.SetRegister(d, result);
}
return true;
@ -271,7 +295,7 @@ bool ArmTranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) {
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
return UnpredictableInstruction();
if (ConditionPassed(cond)) {
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), false);
auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
ir.SetRegister(d, result);
}
return true;
@ -321,7 +345,7 @@ bool ArmTranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) {
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
return UnpredictableInstruction();
if (ConditionPassed(cond)) {
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), true);
auto result = ir.PackedHalvingAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
ir.SetRegister(d, result);
}
return true;
@ -331,7 +355,7 @@ bool ArmTranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) {
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
return UnpredictableInstruction();
if (ConditionPassed(cond)) {
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), false);
auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
ir.SetRegister(d, result);
}
return true;