From 677f62dd6f9aaa5ba2f531908b5210a8e1d45f80 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 22 Dec 2016 07:02:24 -0500 Subject: [PATCH] Implement SHSUB8 and SHSUB16 (#74) * Implement IR operations PackedHalvingSubS8 and PackedHalvingSubS16 --- src/backend_x64/emit_x64.cpp | 70 ++++++++++++++++++- src/frontend/ir/ir_emitter.cpp | 8 +++ src/frontend/ir/ir_emitter.h | 2 + src/frontend/ir/opcodes.inc | 2 + .../translate/translate_arm/parallel.cpp | 18 +++-- 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp index 703e5c81..770fe34e 100644 --- a/src/backend_x64/emit_x64.cpp +++ b/src/backend_x64/emit_x64.cpp @@ -1928,6 +1928,41 @@ void EmitX64::EmitPackedHalvingSubU8(IR::Block&, IR::Inst* inst) { // minuend now contains the desired result. } +void EmitX64::EmitPackedHalvingSubS8(IR::Block&, IR::Inst* inst) { + IR::Value a = inst->GetArg(0); + IR::Value b = inst->GetArg(1); + + Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + + Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32(); + + // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). + + code->xor(minuend, subtrahend); + code->and(subtrahend, minuend); + code->mov(carry, minuend); + code->and(carry, 0x80808080); + code->shr(minuend, 1); + + // At this point, + // minuend := (a^b) >> 1 + // subtrahend := (a^b) & b + // carry := (a^b) & 0x80808080 + + // We must now perform a partitioned subtraction. + // We can do this because minuend contains 7 bit fields. + // We use the extra bit in minuend as a bit to borrow from; we set this bit. + // We invert this bit at the end as this tells us if that bit was borrowed from. + // We then sign extend the result into this bit. 
+ code->or(minuend, 0x80808080); + code->sub(minuend, subtrahend); + code->xor(minuend, 0x80808080); + code->xor(minuend, carry); +} + void EmitX64::EmitPackedHalvingSubU16(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); @@ -1954,8 +1989,41 @@ void EmitX64::EmitPackedHalvingSubU16(IR::Block&, IR::Inst* inst) { code->or(minuend, 0x80008000); code->sub(minuend, subtrahend); code->xor(minuend, 0x80008000); +} - // minuend now contains the desired result. +void EmitX64::EmitPackedHalvingSubS16(IR::Block&, IR::Inst* inst) { + IR::Value a = inst->GetArg(0); + IR::Value b = inst->GetArg(1); + + Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + + Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32(); + + // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). + + code->xor(minuend, subtrahend); + code->and(subtrahend, minuend); + code->mov(carry, minuend); + code->and(carry, 0x80008000); + code->shr(minuend, 1); + + // At this point, + // minuend := (a^b) >> 1 + // subtrahend := (a^b) & b + // carry := (a^b) & 0x80008000 + + // We must now perform a partitioned subtraction. + // We can do this because minuend contains 15 bit fields. + // We use the extra bit in minuend as a bit to borrow from; we set this bit. + // We invert this bit at the end as this tells us if that bit was borrowed from. + // We then sign extend the result into this bit. 
+ code->or(minuend, 0x80008000); + code->sub(minuend, subtrahend); + code->xor(minuend, 0x80008000); + code->xor(minuend, carry); } static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index a58ec842..562c96b7 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -422,6 +422,10 @@ Value IREmitter::PackedHalvingSubU8(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingSubU8, {a, b}); } +Value IREmitter::PackedHalvingSubS8(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingSubS8, {a, b}); +} + Value IREmitter::PackedHalvingAddU16(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingAddU16, {a, b}); } @@ -434,6 +438,10 @@ Value IREmitter::PackedHalvingSubU16(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingSubU16, {a, b}); } +Value IREmitter::PackedHalvingSubS16(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingSubS16, {a, b}); +} + Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) { return Inst(Opcode::PackedSaturatedAddU8, {a, b}); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 0a3d0327..e229b537 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -152,9 +152,11 @@ public: Value PackedHalvingAddU8(const Value& a, const Value& b); Value PackedHalvingAddS8(const Value& a, const Value& b); Value PackedHalvingSubU8(const Value& a, const Value& b); + Value PackedHalvingSubS8(const Value& a, const Value& b); Value PackedHalvingAddU16(const Value& a, const Value& b); Value PackedHalvingAddS16(const Value& a, const Value& b); Value PackedHalvingSubU16(const Value& a, const Value& b); + Value PackedHalvingSubS16(const Value& a, const Value& b); Value PackedSaturatedAddU8(const Value& a, const 
Value& b); Value PackedSaturatedAddS8(const Value& a, const Value& b); Value PackedSaturatedSubU8(const Value& a, const Value& b); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 59c79e94..3afa58b3 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -94,9 +94,11 @@ OPCODE(PackedSubS16, T::U32, T::U32, T::U32 OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS8, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp index 292e5811..7e5b1ebc 100644 --- a/src/frontend/translate/translate_arm/parallel.cpp +++ b/src/frontend/translate/translate_arm/parallel.cpp @@ -268,13 +268,23 @@ bool ArmTranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) { } bool ArmTranslatorVisitor::arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubS16(ir.GetRegister(n), 
ir.GetRegister(m)); + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) {