A32: Implement ASIMD VTBX
This commit is contained in:
parent
06f7229c57
commit
8bbc9fdbb6
8 changed files with 220 additions and 26 deletions
|
@ -40,6 +40,9 @@ using A64FullVectorWidth = std::integral_constant<size_t, 128>;
|
|||
/// Fixed-size array of T whose elements together span one full vector of
/// A64FullVectorWidth::value bits (element count = vector width / bit size of T).
template <typename T>
|
||||
using VectorArray = std::array<T, A64FullVectorWidth::value / Common::BitSize<T>()>;
|
||||
|
||||
/// Fixed-size array of T with half as many elements as fit in a full vector of
/// A64FullVectorWidth::value bits; used by the 64-bit table-lookup fallback to
/// view one 8-byte table entry / result / index operand.
template <typename T>
|
||||
using HalfVectorArray = std::array<T, A64FullVectorWidth::value / Common::BitSize<T>() / 2>;
|
||||
|
||||
struct EmitContext {
|
||||
EmitContext(RegAlloc& reg_alloc, IR::Block& block);
|
||||
|
||||
|
|
|
@ -4029,7 +4029,174 @@ void EmitX64::EmitVectorTable(EmitContext&, IR::Inst* inst) {
|
|||
ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times");
|
||||
}
|
||||
|
||||
void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
|
||||
/// Emits host code for IR::Opcode::VectorTableLookup64.
///
/// Operands of `inst`:
///   arg 0 - 64-bit defaults value,
///   arg 1 - the VectorTable instruction (one to four 64-bit entries; trailing
///           unused slots are void and are not counted in `table_size`),
///   arg 2 - eight byte-sized indices.
///
/// Result byte i is byte `indices[i]` of the concatenated table entries when
/// `indices[i] < 8 * table_size`; otherwise it is byte i of the defaults
/// (this is exactly what the scalar fallback at the bottom computes).
///
/// Several SIMD fast paths are tried in order of decreasing specialisation; the
/// last resort spills everything to the stack and calls a C++ lambda.
/// xmm0 is used throughout as an unallocated temporary (and is the implicit mask
/// operand of PBLENDVB).
void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
    ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());

    const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem){ return !elem.IsVoid(); });
    const bool is_defaults_zero = inst->GetArg(0).IsZero();

    // TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B)

    // sat_const[n] is the per-byte bias for which (index + bias), computed with
    // unsigned saturation, has bit 7 set exactly when index >= 8 * n, i.e. when the
    // index is out of range for a table of n 64-bit entries.
    const std::array<u64, 5> sat_const{
        0,
        0x7878787878787878,
        0x7070707070707070,
        0x6868686868686868,
        0x6060606060606060,
    };

    // Fast path: defaults are zero and the table fits in one 128-bit register.
    // PSHUFB writes zero for any index byte with bit 7 set, which supplies the
    // zero defaults for free.
    if (code.HasSSSE3() && is_defaults_zero && table_size <= 2) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);

        if (table_size == 2) {
            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
            code.punpcklqdq(xmm_table0, xmm_table0_upper);
            ctx.reg_alloc.Release(xmm_table0_upper);
        } else {
            // Indices 8..15 stay below the 0x80 threshold after the bias below and
            // therefore select bytes from the upper qword of the table register.
            // They are out of range for a one-entry table and must produce zero,
            // and nothing here guarantees the upper half of a register holding a
            // 64-bit value is clear, so zero it explicitly.
            code.pxor(xmm0, xmm0);
            code.punpcklqdq(xmm_table0, xmm0);
        }

        // Bias so that indices >= 16 get bit 7 set (-> zero). The low nibble is
        // unchanged by adding 0x70, so in-range indices still address correctly.
        // The upper eight index lanes are forced to 0xFF, zeroing the upper result.
        code.paddusb(indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
        code.pshufb(xmm_table0, indicies);

        ctx.reg_alloc.DefineValue(inst, xmm_table0);
        return;
    }

    // Table fits in one register, arbitrary defaults: shuffle with the raw indices,
    // then blend in the defaults wherever the biased index is out of range.
    // The blend mask is computed from sat_const[table_size], so stale upper bits of
    // a one-entry table register never reach the result.
    if (code.HasSSE41() && table_size <= 2) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);

        if (table_size == 2) {
            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
            code.punpcklqdq(xmm_table0, xmm_table0_upper);
            ctx.reg_alloc.Release(xmm_table0_upper);
        }

        // xmm0 = out-of-range mask (bit 7 per byte), consumed implicitly by PBLENDVB.
        if (code.HasAVX()) {
            code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
        } else {
            code.movaps(xmm0, indicies);
            code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
        }
        code.pshufb(xmm_table0, indicies);
        code.pblendvb(xmm_table0, defaults);

        ctx.reg_alloc.DefineValue(inst, xmm_table0);
        return;
    }

    // Three or four entries (smaller tables were handled above), zero defaults:
    // look up both 128-bit halves of the table and pick per byte.
    if (code.HasSSE41() && is_defaults_zero) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);

        {
            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
            code.punpcklqdq(xmm_table0, xmm_table0_upper);
            ctx.reg_alloc.Release(xmm_table0_upper);
        }
        if (table_size == 4) {
            const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
            code.punpcklqdq(xmm_table1, xmm_table1_upper);
            ctx.reg_alloc.Release(xmm_table1_upper);
        } else {
            // Three entries: indices 24..31 select bytes from the upper qword of
            // xmm_table1 and must read as zero. Clear it explicitly rather than
            // relying on whatever the register's upper half happens to contain.
            // (xmm0 is overwritten with the selection mask right below.)
            code.pxor(xmm0, xmm0);
            code.punpcklqdq(xmm_table1, xmm0);
        }

        // xmm0: bit 7 set for index >= 16 -> zero from table0, and selects table1
        // in the blend. indicies + 0x60: bit 7 set for index >= 32 -> zero from table1.
        if (code.HasAVX()) {
            code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
        } else {
            code.movaps(xmm0, indicies);
            code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
        }
        code.paddusb(indicies, code.MConst(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF));
        code.pshufb(xmm_table0, xmm0);
        code.pshufb(xmm_table1, indicies);
        code.pblendvb(xmm_table0, xmm_table1);

        ctx.reg_alloc.DefineValue(inst, xmm_table0);
        return;
    }

    // Three or four entries, arbitrary defaults: same two-register lookup, followed
    // by a second blend that replaces out-of-range bytes with the defaults.
    // The second mask uses sat_const[table_size], so for a three-entry table the
    // unused upper qword of xmm_table1 is always overridden by the defaults.
    if (code.HasSSE41()) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);

        {
            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
            code.punpcklqdq(xmm_table0, xmm_table0_upper);
            ctx.reg_alloc.Release(xmm_table0_upper);
        }
        if (table_size == 4) {
            const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
            code.punpcklqdq(xmm_table1, xmm_table1_upper);
            ctx.reg_alloc.Release(xmm_table1_upper);
        }

        // First mask: bit 7 set for index >= 16 -> take the byte from table1.
        if (code.HasAVX()) {
            code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
        } else {
            code.movaps(xmm0, indicies);
            code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
        }
        code.pshufb(xmm_table0, indicies);
        code.pshufb(xmm_table1, indicies);
        code.pblendvb(xmm_table0, xmm_table1);
        // Second mask: bit 7 set for index >= 8 * table_size -> take the default byte.
        if (code.HasAVX()) {
            code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
        } else {
            code.movaps(xmm0, indicies);
            code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
        }
        code.pblendvb(xmm_table0, defaults);

        ctx.reg_alloc.DefineValue(inst, xmm_table0);
        return;
    }

    // Scalar fallback. Stack layout above the ABI shadow space, in 8-byte slots:
    //   [0..3] table entries, [4] defaults in / result out, [5] indices.
    const u32 stack_space = static_cast<u32>(6 * 8);
    code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
    for (size_t i = 0; i < table_size; ++i) {
        const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]);
        code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
        ctx.reg_alloc.Release(table_value);
    }
    const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
    const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    ctx.reg_alloc.EndOfAllocScope();
    ctx.reg_alloc.HostCall(nullptr);

    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
    code.mov(code.ABI_PARAM4.cvt32(), table_size);
    // The result slot is preloaded with the defaults; the lambda only overwrites
    // the bytes whose index is in range.
    code.movq(qword[code.ABI_PARAM2], defaults);
    code.movq(qword[code.ABI_PARAM3], indicies);

    code.CallLambda(
        [](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) {
            for (size_t i = 0; i < result.size(); ++i) {
                const size_t index = indicies[i] / table[0].size();
                const size_t elem = indicies[i] % table[0].size();
                if (index < table_size) {
                    result[i] = table[index][elem];
                }
            }
        }
    );

    code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
    code.add(rsp, stack_space + ABI_SHADOW_SPACE);

    ctx.reg_alloc.DefineValue(inst, result);
}
|
||||
|
||||
void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||
ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
|
|
@ -112,7 +112,7 @@ INST(asimd_VRSQRTE, "VRSQRTE", "111100111D11zz11dddd010
|
|||
// Miscellaneous
|
||||
INST(asimd_VEXT, "VEXT", "111100101D11nnnnddddiiiiNQM0mmmm") // ASIMD
|
||||
INST(asimd_VTBL, "VTBL", "111100111D11nnnndddd10zzN0M0mmmm") // ASIMD
|
||||
//INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD
|
||||
INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD
|
||||
//INST(asimd_VDUP_scalar, "VDUP (scalar)", "111100111D11iiiidddd11000QM0mmmm") // ASIMD
|
||||
|
||||
// One register and modified immediate
|
||||
|
|
|
@ -10,6 +10,31 @@
|
|||
|
||||
namespace Dynarmic::A32 {
|
||||
|
||||
// Shared translation of the A32 ASIMD table-lookup instructions.
//
// Reads `len + 1` consecutive extended registers starting at Vn:N as the table,
// uses register Vm:M as the byte indices, and writes the lookup result to Vd:D.
// When `is_vtbl` is true the lookup defaults are an immediate zero; otherwise the
// current contents of the destination register are used as defaults.
// Returns the visitor's UnpredictableInstruction() result if the table would
// extend past register number 31, and true otherwise.
static bool TableLookup(ArmTranslatorVisitor& v, bool is_vtbl, bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
    const size_t table_regs = len + 1;
    const auto dest_reg = ToVector(false, Vd, D);
    const auto index_reg = ToVector(false, Vm, M);
    const auto base_reg = ToVector(false, Vn, N);

    if (RegNumber(base_reg) + table_regs > 32) {
        return v.UnpredictableInstruction();
    }

    // Reads the table registers in ascending order.
    const auto read_table_registers = [&] {
        std::vector<IR::U64> entries;
        for (size_t offset = 0; offset < table_regs; ++offset) {
            entries.emplace_back(v.ir.GetExtendedRegister(base_reg + offset));
        }
        return entries;
    };

    const IR::Table table = v.ir.VectorTable(read_table_registers());
    const IR::U64 indicies = v.ir.GetExtendedRegister(index_reg);
    const IR::U64 defaults = !is_vtbl ? IR::U64{v.ir.GetExtendedRegister(dest_reg)} : v.ir.Imm64(0);

    v.ir.SetExtendedRegister(dest_reg, v.ir.VectorTableLookup(defaults, table, indicies));
    return true;
}
|
||||
|
||||
bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm) {
|
||||
if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) {
|
||||
return UndefinedInstruction();
|
||||
|
@ -33,28 +58,11 @@ bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4,
|
|||
}
|
||||
|
||||
bool ArmTranslatorVisitor::asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
|
||||
const size_t length = len + 1;
|
||||
const auto d = ToVector(false, Vd, D);
|
||||
const auto m = ToVector(false, Vm, M);
|
||||
const auto n = ToVector(false, Vn, N);
|
||||
return TableLookup(*this, true, D, Vn, Vd, len, N, M, Vm);
|
||||
}
|
||||
|
||||
if (RegNumber(n) + length > 32) {
|
||||
return UnpredictableInstruction();
|
||||
}
|
||||
|
||||
const IR::U64 table0 = ir.GetExtendedRegister(n);
|
||||
const IR::U64 table1 = length >= 2 ? IR::U64{ir.GetExtendedRegister(n + 1)} : ir.Imm64(0);
|
||||
const IR::U64 table2 = length >= 3 ? IR::U64{ir.GetExtendedRegister(n + 2)} : ir.Imm64(0);
|
||||
const IR::U64 table3 = length == 4 ? IR::U64{ir.GetExtendedRegister(n + 3)} : ir.Imm64(0);
|
||||
|
||||
const IR::Table table = ir.VectorTable(length <= 2
|
||||
? std::vector<IR::U128>{ir.Pack2x64To1x128(table0, table1)}
|
||||
: std::vector<IR::U128>{ir.Pack2x64To1x128(table0, table1), ir.Pack2x64To1x128(table2, table3)});
|
||||
const IR::U128 indicies = ir.GetVector(m);
|
||||
const IR::U128 result = ir.VectorTableLookup(ir.ZeroVector(), table, indicies);
|
||||
|
||||
ir.SetVector(d, result);
|
||||
return true;
|
||||
bool ArmTranslatorVisitor::asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
|
||||
return TableLookup(*this, false, D, Vn, Vd, len, N, M, Vm);
|
||||
}
|
||||
|
||||
} // namespace Dynarmic::A32
|
||||
|
|
|
@ -514,6 +514,7 @@ struct ArmTranslatorVisitor final {
|
|||
// Advanced SIMD miscellaneous
|
||||
bool asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
|
||||
bool asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
|
||||
|
||||
// Advanced SIMD load/store structures
|
||||
bool v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m);
|
||||
|
|
|
@ -1739,14 +1739,26 @@ U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
|
|||
UNREACHABLE();
|
||||
}
|
||||
|
||||
/// Builds a VectorTable instruction from one to four 64-bit table entries.
/// The instruction always takes four operands, so the vector is grown to four
/// elements with default-constructed U64 values before emission.
/// NOTE(review): the padding values are presumably empty/void IR values; the
/// x64 backend counts non-void operands to recover the table size - confirm.
Table IREmitter::VectorTable(std::vector<U64> values) {
    ASSERT(values.size() >= 1 && values.size() <= 4);

    constexpr size_t operand_count = 4;
    values.resize(operand_count);

    const U64& entry0 = values[0];
    const U64& entry1 = values[1];
    const U64& entry2 = values[2];
    const U64& entry3 = values[3];
    return Inst<Table>(Opcode::VectorTable, entry0, entry1, entry2, entry3);
}
|
||||
|
||||
/// Builds a VectorTable instruction from one to four 128-bit table entries.
/// The instruction always takes four operands, so the vector is grown to four
/// elements with default-constructed U128 values before emission.
/// NOTE(review): the padding values are presumably empty/void IR values - confirm.
Table IREmitter::VectorTable(std::vector<U128> values) {
    ASSERT(values.size() >= 1 && values.size() <= 4);

    constexpr size_t operand_count = 4;
    values.resize(operand_count);

    const U128& entry0 = values[0];
    const U128& entry1 = values[1];
    const U128& entry2 = values[2];
    const U128& entry3 = values[3];
    return Inst<Table>(Opcode::VectorTable, entry0, entry1, entry2, entry3);
}
|
||||
|
||||
/// Emits a 64-bit table lookup (Opcode::VectorTableLookup64).
/// `table` must have been built from U64 entries; this is checked on the table
/// instruction's first operand.
U64 IREmitter::VectorTableLookup(const U64& defaults, const Table& table, const U64& indices) {
    ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U64);

    const U64 lookup = Inst<U64>(Opcode::VectorTableLookup64, defaults, table, indices);
    return lookup;
}
|
||||
|
||||
/// Emits a 128-bit table lookup (Opcode::VectorTableLookup128).
/// `table` must have been built from U128 entries; this is checked on the table
/// instruction's first operand before the lookup is emitted.
U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, const U128& indices) {
    ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U128);
    return Inst<U128>(Opcode::VectorTableLookup128, defaults, table, indices);
}
|
||||
|
||||
U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) {
|
||||
|
|
|
@ -297,7 +297,9 @@ public:
|
|||
U128 VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSub(size_t esize, const U128& a, const U128& b);
|
||||
Table VectorTable(std::vector<U64> values);
|
||||
Table VectorTable(std::vector<U128> values);
|
||||
U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices);
|
||||
U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
|
||||
U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorUnsignedRecipEstimate(const U128& a);
|
||||
|
|
|
@ -470,8 +470,9 @@ OPCODE(VectorSub8, U128, U128
|
|||
OPCODE(VectorSub16, U128, U128, U128 )
|
||||
OPCODE(VectorSub32, U128, U128, U128 )
|
||||
OPCODE(VectorSub64, U128, U128, U128 )
|
||||
OPCODE(VectorTable, Table, U128, Opaque, Opaque, Opaque )
|
||||
OPCODE(VectorTableLookup, U128, U128, Table, U128 )
|
||||
OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque )
|
||||
OPCODE(VectorTableLookup64, U64, U64, Table, U64 )
|
||||
OPCODE(VectorTableLookup128, U128, U128, Table, U128 )
|
||||
OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 )
|
||||
OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 )
|
||||
OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )
|
||||
|
|
Loading…
Reference in a new issue