Implemented the ARM UHADD8 instruction. (#45)
The x64 implementation uses the SSSE3 instruction PSHUFB. A fallback that uses only general-purpose registers is provided in case the CPU doesn't support SSSE3.
This commit is contained in:
parent
f32921d493
commit
4d44474ad4
6 changed files with 80 additions and 3 deletions
|
@ -7,8 +7,6 @@
|
|||
#include <unordered_map>
|
||||
#include <common/bit_util.h>
|
||||
|
||||
#include <xbyak.h>
|
||||
|
||||
#include "backend_x64/abi.h"
|
||||
#include "backend_x64/emit_x64.h"
|
||||
#include "backend_x64/jitstate.h"
|
||||
|
@ -1258,6 +1256,68 @@ static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst
|
|||
code->movd(result, xmm_scratch_a);
|
||||
}
|
||||
|
||||
// Emits x64 code for the PackedHalvingAddU8 IR instruction.
//
// Both arguments are treated as four packed unsigned bytes. Each result byte is
// (a_byte + b_byte) >> 1, where the sum is formed at more than 8 bits of
// precision so that the carry out of the byte addition is not lost.
//
// block: not used by this emitter.
// inst:  the IR instruction; arguments 0 and 1 are the packed operands, and the
//        register defined for inst receives the packed result.
void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
    IR::Value a = inst->GetArg(0);
    IR::Value b = inst->GetArg(1);

    // This code path requires SSSE3 because of the PSHUFB instruction.
    // A fallback implementation is provided below.
    if (cpu_info.has(Xbyak::util::Cpu::tSSSE3)) {
        Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
        Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();

        // Load the operands into Xmm registers
        Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
        Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();

        Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm();
        Xbyak::Reg64 mask = reg_alloc.ScratchGpr();

        code->movd(xmm_scratch_a, result);
        code->movd(xmm_scratch_b, arg);

        // Set the mask to expand the values
        // 0xAABBCCDD becomes 0x00AA00BB00CC00DD
        // Selector bytes (lowest first): 00 80 01 80 02 80 03 80.
        // A selector with bit 7 set (0x80) makes PSHUFB write a zero byte, so
        // every source byte ends up zero-extended in its own 16-bit lane.
        code->mov(mask, 0x8003800280018000);
        code->movq(xmm_mask, mask);

        // Expand each 8-bit value to 16-bit
        code->pshufb(xmm_scratch_a, xmm_mask);
        code->pshufb(xmm_scratch_b, xmm_mask);

        // Add the individual 16-bit values
        // (the 9-bit sums cannot overflow a 16-bit lane)
        code->paddw(xmm_scratch_a, xmm_scratch_b);

        // Shift the 16-bit values to the right to halve them
        code->psrlw(xmm_scratch_a, 1);

        // Set the mask to pack the values again
        // 0x00AA00BB00CC00DD becomes 0xAABBCCDD
        // Selector bytes (lowest first): 00 02 04 06, i.e. the low byte of
        // each of the four 16-bit lanes.
        code->mov(mask, 0x06040200);
        code->movq(xmm_mask, mask);

        // Shuffle them back to 8-bit values
        code->pshufb(xmm_scratch_a, xmm_mask);

        // Only the low 32 bits are read back into the result register.
        code->movd(result, xmm_scratch_a);
        return;
    }

    // Fallback implementation in case the CPU doesn't support SSSE3
    //
    // Uses a + b == 2 * (a & b) + (a ^ b), hence per byte
    //     (a + b) >> 1 == (a & b) + ((a ^ b) >> 1)
    // All four bytes are processed at once in a 32-bit register; the
    // 0x7F7F7F7F mask drops the bit that the shift moved across each byte
    // boundary from the neighbouring byte.
    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
    Xbyak::Reg32 and_a_b = reg_a;
    Xbyak::Reg32 result = reg_a;

    // Copy a before reg_a is overwritten with (a & b).
    code->mov(xor_a_b, reg_a);
    // and_/xor_ are used instead of and/xor: the latter are alternative
    // operator tokens in standard C++ and cannot be used as member names
    // unless the compiler is told not to reserve them.
    code->and_(and_a_b, reg_b);
    code->xor_(xor_a_b, reg_b);
    code->shr(xor_a_b, 1);
    code->and_(xor_a_b, 0x7F7F7F7F);
    code->add(result, xor_a_b);
}
|
||||
|
||||
// Emits code for the PackedSaturatedAddU8 IR instruction by delegating to
// EmitPackedOperation with Xbyak's PADDUSB emitter as the packed operation.
// The block parameter is not used here.
void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
    EmitPackedOperation(code,
                        reg_alloc,
                        inst,
                        &Xbyak::CodeGenerator::paddusb);
}
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
|
||||
#include <boost/optional.hpp>
|
||||
|
||||
#include <xbyak_util.h>
|
||||
|
||||
#include "backend_x64/block_of_code.h"
|
||||
#include "backend_x64/reg_alloc.h"
|
||||
#include "dynarmic/callbacks.h"
|
||||
|
@ -76,6 +78,9 @@ private:
|
|||
void EmitTerminalCheckHalt(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location);
|
||||
void Patch(IR::LocationDescriptor desc, CodePtr bb);
|
||||
|
||||
// Global CPU information
|
||||
Xbyak::util::Cpu cpu_info;
|
||||
|
||||
// Per-block state
|
||||
RegAlloc reg_alloc;
|
||||
|
||||
|
|
|
@ -320,6 +320,10 @@ Value IREmitter::ByteReverseDual(const Value& a) {
|
|||
return Inst(Opcode::ByteReverseDual, {a});
|
||||
}
|
||||
|
||||
// Creates a PackedHalvingAddU8 instruction through Inst() with a and b as its
// two arguments and returns the Value that Inst() yields.
Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
    return Inst(Opcode::PackedHalvingAddU8, {a, b});
}
|
||||
|
||||
// Creates a PackedSaturatedAddU8 instruction through Inst() with a and b as
// its two arguments and returns the Value that Inst() yields.
Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) {
    return Inst(
        Opcode::PackedSaturatedAddU8,
        {a, b});
}
|
||||
|
|
|
@ -121,6 +121,7 @@ public:
|
|||
Value ByteReverseWord(const Value& a);
|
||||
Value ByteReverseHalf(const Value& a);
|
||||
Value ByteReverseDual(const Value& a);
|
||||
Value PackedHalvingAddU8(const Value& a, const Value& b);
|
||||
Value PackedSaturatedAddU8(const Value& a, const Value& b);
|
||||
Value PackedSaturatedAddS8(const Value& a, const Value& b);
|
||||
Value PackedSaturatedSubU8(const Value& a, const Value& b);
|
||||
|
|
|
@ -71,6 +71,7 @@ OPCODE(ZeroExtendByteToWord, T::U32, T::U8
|
|||
OPCODE(ByteReverseWord, T::U32, T::U32 )
|
||||
OPCODE(ByteReverseHalf, T::U16, T::U16 )
|
||||
OPCODE(ByteReverseDual, T::U64, T::U64 )
|
||||
OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 )
|
||||
OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 )
|
||||
|
|
|
@ -179,7 +179,13 @@ bool ArmTranslatorVisitor::arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) {
|
|||
}
|
||||
|
||||
// Translates the ARM UHADD8 instruction into IR.
//
// Using the PC as d, n or m is reported through UnpredictableInstruction().
// Otherwise, when ConditionPassed(cond) holds, registers n and m are combined
// with the PackedHalvingAddU8 IR operation and the result is written to
// register d.
bool ArmTranslatorVisitor::arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) {
    // The stale `return InterpretThisInstruction();` that used to open this
    // body has been dropped; it made the translation below unreachable.
    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
        return UnpredictableInstruction();
    if (ConditionPassed(cond)) {
        auto result = ir.PackedHalvingAddU8(ir.GetRegister(n), ir.GetRegister(m));
        ir.SetRegister(d, result);
    }
    return true;
}
|
||||
|
||||
bool ArmTranslatorVisitor::arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) {
|
||||
|
|
Loading…
Reference in a new issue