A32: Implement ASIMD VTBX

This commit is contained in:
MerryMage 2020-06-20 22:34:55 +01:00
parent 06f7229c57
commit 8bbc9fdbb6
8 changed files with 220 additions and 26 deletions

View file

@ -40,6 +40,9 @@ using A64FullVectorWidth = std::integral_constant<size_t, 128>;
// Fixed-size array of T with as many elements as fit in one full-width vector:
// A64FullVectorWidth::value (128) divided by Common::BitSize<T>().
// NOTE(review): Common::BitSize<T>() is presumably the width of T in bits - confirm.
template <typename T>
using VectorArray = std::array<T, A64FullVectorWidth::value / Common::BitSize<T>()>;
// Fixed-size array of T with half as many elements as a full-width vector holds:
// A64FullVectorWidth::value (128) / Common::BitSize<T>() / 2.
// Used to view a half-width (64-bit) vector as individual elements.
template <typename T>
using HalfVectorArray = std::array<T, A64FullVectorWidth::value / Common::BitSize<T>() / 2>;
struct EmitContext {
EmitContext(RegAlloc& reg_alloc, IR::Block& block);

View file

@ -4029,7 +4029,174 @@ void EmitX64::EmitVectorTable(EmitContext&, IR::Inst* inst) {
ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times");
}
void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
// Emits x64 code for the 64-bit table lookup IR instruction.
//
// Operands as read below:
//   arg 0 - default value, kept for result bytes whose index is out of range
//   arg 1 - the VectorTable instruction; its non-void arguments are the 64-bit table entries
//   arg 2 - the byte indices
//
// Four SIMD strategies are tried in order, chosen by host features, by whether the
// defaults are zero and by the table size. If none applies, the operands are written
// to the stack and a C++ lambda performs the lookup byte by byte.
void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());
// Number of table entries actually supplied; void arguments are unused slots.
const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem){ return !elem.IsVoid(); });
const bool is_defaults_zero = inst->GetArg(0).IsZero();
// TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B)
// Addend for the saturating byte add, indexed by table_size. Each byte is
// 0x80 - 8 * table_size, so adding it to an index sets the byte's top bit exactly
// when the index is >= 8 * table_size (i.e. out of range for that table).
// Slot 0 is a placeholder. NOTE(review): assumes table_size is never 0 - confirm.
const std::array<u64, 5> sat_const{
0,
0x7878787878787878,
0x7070707070707070,
0x6868686868686868,
0x6060606060606060,
};
// Path 1: zero defaults and at most two entries - a single pshufb is enough.
// pshufb writes zero for every selector byte whose top bit is set, and the saturating
// add of 0x70 sets that bit for any index >= 16. The high qword of the constant is
// all-ones, so the upper eight selector bytes saturate to 0xFF and produce zero.
// NOTE(review): with table_size == 1, indices 8..15 select from the upper half of
// xmm_table0; this presumably relies on that half being zero - confirm.
if (code.HasSSSE3() && is_defaults_zero && table_size <= 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
if (table_size == 2) {
// Pack the two 64-bit entries into one 128-bit register: [table[1] : table[0]].
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
code.paddusb(indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
code.pshufb(xmm_table0, indicies);
ctx.reg_alloc.DefineValue(inst, xmm_table0);
return;
}
// Path 2: arbitrary defaults, at most two entries.
// xmm0 receives the out-of-range mask (top bit set where index >= 8 * table_size) and
// is the implicit mask operand of the non-VEX pblendvb, which then takes those bytes
// from defaults.
// NOTE(review): xmm0 is written without being requested from the register allocator;
// presumably it is reserved for this purpose - confirm.
if (code.HasSSE41() && table_size <= 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
if (table_size == 2) {
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
}
// In-range bytes come from the shuffle; everything flagged in xmm0 is replaced next.
code.pshufb(xmm_table0, indicies);
code.pblendvb(xmm_table0, defaults);
ctx.reg_alloc.DefineValue(inst, xmm_table0);
return;
}
// Path 3: zero defaults with three or four entries (smaller tables were handled above).
// Entries 0/1 are packed into xmm_table0 and entries 2/3 into xmm_table1.
//   xmm0     = index + 0x70 -> top bit set for index >= 16
//   indicies = index + 0x60 -> top bit set for index >= 32, low nibble = index mod 16
// pshufb(xmm_table0, xmm0) yields zero for index >= 16; pshufb(xmm_table1, indicies)
// yields zero for index >= 32. pblendvb then takes the xmm_table1 result wherever
// xmm0 has its top bit set.
// NOTE(review): with table_size == 3, indices 24..31 select from the upper half of
// xmm_table1, which is not filled here; presumably it is zero - confirm.
if (code.HasSSE41() && is_defaults_zero) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
{
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (table_size == 4) {
const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
}
code.paddusb(indicies, code.MConst(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF));
code.pshufb(xmm_table0, xmm0);
code.pshufb(xmm_table1, indicies);
code.pblendvb(xmm_table0, xmm_table1);
ctx.reg_alloc.DefineValue(inst, xmm_table0);
return;
}
// Path 4: arbitrary defaults with three or four entries.
// Two blends are performed, each with a freshly computed mask in xmm0:
//   1. mask = index + 0x70: take the xmm_table1 lookup where index >= 16;
//   2. mask = index + sat_const[table_size]: take defaults where index is out of range.
// Both shuffles use the raw indices (pshufb only looks at the low nibble and the top
// bit), so wrongly selected bytes are always overwritten by one of the two blends.
if (code.HasSSE41()) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
{
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (table_size == 4) {
const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
}
code.pshufb(xmm_table0, indicies);
code.pshufb(xmm_table1, indicies);
code.pblendvb(xmm_table0, xmm_table1);
if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
} else {
code.movaps(xmm0, indicies);
code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
}
code.pblendvb(xmm_table0, defaults);
ctx.reg_alloc.DefineValue(inst, xmm_table0);
return;
}
// Fallback: no usable SIMD path. Reserve six qwords of scratch stack above the ABI
// shadow space, laid out as:
//   slots 0..3 - table entries (only the first table_size are written)
//   slot  4    - defaults on entry, overwritten in place with the result
//   slot  5    - indices
const u32 stack_space = static_cast<u32>(6 * 8);
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]);
code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
ctx.reg_alloc.Release(table_value);
}
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
// NOTE(review): EndOfAllocScope/HostCall presumably end the current allocations and
// prepare register state for calling into host code - confirm against RegAlloc.
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
// Lambda arguments: table base, result/defaults slot, indices slot, entry count.
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
code.mov(code.ABI_PARAM4.cvt32(), table_size);
code.movq(qword[code.ABI_PARAM2], defaults);
code.movq(qword[code.ABI_PARAM3], indicies);
code.CallLambda(
// `result` already holds the defaults, so a byte is only overwritten when its index
// selects an existing table entry; out-of-range bytes keep the default value.
[](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) {
for (size_t i = 0; i < result.size(); ++i) {
// Split the index into (which table entry, which byte within that entry).
const size_t index = indicies[i] / table[0].size();
const size_t elem = indicies[i] % table[0].size();
if (index < table_size) {
result[i] = table[index][elem];
}
}
}
);
// Read the result back from slot 4 and release the scratch stack.
code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.DefineValue(inst, result);
}
void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);

View file

@ -112,7 +112,7 @@ INST(asimd_VRSQRTE, "VRSQRTE", "111100111D11zz11dddd010
// Miscellaneous
INST(asimd_VEXT, "VEXT", "111100101D11nnnnddddiiiiNQM0mmmm") // ASIMD
INST(asimd_VTBL, "VTBL", "111100111D11nnnndddd10zzN0M0mmmm") // ASIMD
//INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD
INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD
//INST(asimd_VDUP_scalar, "VDUP (scalar)", "111100111D11iiiidddd11000QM0mmmm") // ASIMD
// One register and modified immediate

View file

@ -10,6 +10,31 @@
namespace Dynarmic::A32 {
// Shared translation for the VTBL and VTBX encodings.
//
// Reads `len + 1` consecutive extended registers starting at the register selected by
// Vn/N as the table, the register selected by Vm/M as the byte indices, and writes the
// lookup result to the register selected by Vd/D. When `is_vtbl` is true the default
// value handed to the lookup is an immediate zero; otherwise it is the current value
// of the destination register.
//
// Returns the result of UnpredictableInstruction() if the table would extend past
// register number 31, and true otherwise.
static bool TableLookup(ArmTranslatorVisitor& v, bool is_vtbl, bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
    // `len` encodes the number of table registers minus one.
    const size_t table_reg_count = len + 1;
    const auto dest_reg = ToVector(false, Vd, D);
    const auto index_reg = ToVector(false, Vm, M);
    const auto first_table_reg = ToVector(false, Vn, N);

    // Reject encodings whose table would run past register number 31.
    if (RegNumber(first_table_reg) + table_reg_count > 32) {
        return v.UnpredictableInstruction();
    }

    // Table registers are read first, in ascending order, before any other operand.
    const auto read_table_registers = [&] {
        std::vector<IR::U64> entries;
        for (size_t offset = 0; offset != table_reg_count; ++offset) {
            entries.emplace_back(v.ir.GetExtendedRegister(first_table_reg + offset));
        }
        return entries;
    };

    const IR::Table table = v.ir.VectorTable(read_table_registers());
    const IR::U64 index_bytes = v.ir.GetExtendedRegister(index_reg);
    const IR::U64 fallback = is_vtbl ? v.ir.Imm64(0) : IR::U64{v.ir.GetExtendedRegister(dest_reg)};
    const IR::U64 looked_up = v.ir.VectorTableLookup(fallback, table, index_bytes);

    v.ir.SetExtendedRegister(dest_reg, looked_up);
    return true;
}
bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm) {
if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) {
return UndefinedInstruction();
@ -33,28 +58,11 @@ bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4,
}
// VTBL: table lookup whose default value is zero.
// All decoding and IR emission lives in TableLookup; passing is_vtbl = true makes it use
// an immediate zero as the default value instead of reading the destination register.
// The previously computed locals (length, d, m, n) were never used here because
// TableLookup derives them itself, so they have been dropped.
bool ArmTranslatorVisitor::asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
    constexpr bool is_vtbl = true;
    return TableLookup(*this, is_vtbl, D, Vn, Vd, len, N, M, Vm);
}
if (RegNumber(n) + length > 32) {
return UnpredictableInstruction();
}
const IR::U64 table0 = ir.GetExtendedRegister(n);
const IR::U64 table1 = length >= 2 ? IR::U64{ir.GetExtendedRegister(n + 1)} : ir.Imm64(0);
const IR::U64 table2 = length >= 3 ? IR::U64{ir.GetExtendedRegister(n + 2)} : ir.Imm64(0);
const IR::U64 table3 = length == 4 ? IR::U64{ir.GetExtendedRegister(n + 3)} : ir.Imm64(0);
const IR::Table table = ir.VectorTable(length <= 2
? std::vector<IR::U128>{ir.Pack2x64To1x128(table0, table1)}
: std::vector<IR::U128>{ir.Pack2x64To1x128(table0, table1), ir.Pack2x64To1x128(table2, table3)});
const IR::U128 indicies = ir.GetVector(m);
const IR::U128 result = ir.VectorTableLookup(ir.ZeroVector(), table, indicies);
ir.SetVector(d, result);
return true;
// VTBX: table lookup that keeps the destination's existing value as the default.
// Delegates to TableLookup with is_vtbl = false, which makes it read the destination
// register and pass that as the default value.
bool ArmTranslatorVisitor::asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
    constexpr bool is_vtbl = false;
    return TableLookup(*this, is_vtbl, D, Vn, Vd, len, N, M, Vm);
}
} // namespace Dynarmic::A32

View file

@ -514,6 +514,7 @@ struct ArmTranslatorVisitor final {
// Advanced SIMD miscellaneous
bool asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm);
bool asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
bool asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
// Advanced SIMD load/store structures
bool v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m);

View file

@ -1739,14 +1739,26 @@ U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
UNREACHABLE();
}
// Builds a VectorTable instruction from one to four 64-bit entries.
// The opcode always receives four operands, so the vector is grown to four elements
// and the unused trailing slots are value-initialised.
// NOTE(review): value-initialised entries are presumably the "void" operands that
// consumers of the table skip - confirm.
Table IREmitter::VectorTable(std::vector<U64> values) {
    ASSERT(values.size() >= 1 && values.size() <= 4);

    constexpr size_t operand_count = 4;
    values.resize(operand_count);

    const U64& entry0 = values[0];
    const U64& entry1 = values[1];
    const U64& entry2 = values[2];
    const U64& entry3 = values[3];
    return Inst<Table>(Opcode::VectorTable, entry0, entry1, entry2, entry3);
}
// Builds a VectorTable instruction from one to four 128-bit entries.
// The opcode always receives four operands, so the vector is grown to four elements
// and the unused trailing slots are value-initialised.
// NOTE(review): value-initialised entries are presumably the "void" operands that
// consumers of the table skip - confirm.
Table IREmitter::VectorTable(std::vector<U128> values) {
    ASSERT(values.size() >= 1 && values.size() <= 4);

    constexpr size_t operand_count = 4;
    values.resize(operand_count);

    const U128& entry0 = values[0];
    const U128& entry1 = values[1];
    const U128& entry2 = values[2];
    const U128& entry3 = values[3];
    return Inst<Table>(Opcode::VectorTable, entry0, entry1, entry2, entry3);
}
// 64-bit table lookup: emits a VectorTableLookup64 instruction taking the default
// value, the table and the indices, and returns its U64 result.
U64 IREmitter::VectorTableLookup(const U64& defaults, const Table& table, const U64& indices) {
// The table must have been built from U64 entries; only its first operand is checked.
ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U64);
return Inst<U64>(Opcode::VectorTableLookup64, defaults, table, indices);
}
// 128-bit table lookup: emits a VectorTableLookup128 instruction taking the default
// value, the table and the indices, and returns its U128 result.
U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, const U128& indices) {
    // The table must have been built from U128 entries; only its first operand is checked.
    // This check has to run before the instruction is emitted: a leftover early return
    // using the old Opcode::VectorTableLookup previously made it unreachable.
    ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U128);
    return Inst<U128>(Opcode::VectorTableLookup128, defaults, table, indices);
}
U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) {

View file

@ -297,7 +297,9 @@ public:
U128 VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b);
U128 VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, const U128& b);
U128 VectorSub(size_t esize, const U128& a, const U128& b);
// Builds a lookup table from 64-bit entries (for use with the U64 VectorTableLookup).
Table VectorTable(std::vector<U64> values);
// Builds a lookup table from 128-bit entries (for use with the U128 VectorTableLookup).
Table VectorTable(std::vector<U128> values);
// Table lookup on a 64-bit vector of indices; `defaults` is the first operand of the lookup.
U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices);
// Table lookup on a 128-bit vector of indices; `defaults` is the first operand of the lookup.
U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
U128 VectorUnsignedRecipEstimate(const U128& a);

View file

@ -470,8 +470,9 @@ OPCODE(VectorSub8, U128, U128
OPCODE(VectorSub16, U128, U128, U128 )
OPCODE(VectorSub32, U128, U128, U128 )
OPCODE(VectorSub64, U128, U128, U128 )
OPCODE(VectorTable, Table, U128, Opaque, Opaque, Opaque )
OPCODE(VectorTableLookup, U128, U128, Table, U128 )
OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque )
OPCODE(VectorTableLookup64, U64, U64, Table, U64 )
OPCODE(VectorTableLookup128, U128, U128, Table, U128 )
OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 )
OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 )
OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )