IR: Add VectorMultiply{Signed,Unsigned}Widen instructions

Polyfill for x86-64 backend

parent bbf0179d30
commit 61d509dda2

11 changed files with 180 additions and 15 deletions
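The new opcodes give the IR a first-class widening vector multiply: the esize suffix (8, 16, 32) names the source lane width, each lane of the two operands is sign- or zero-extended to twice that width, and the products come back as double-width lanes. Judging from the polyfill below, the source lanes are taken from the low 64 bits of each 128-bit operand. A minimal C++ sketch of the signed 8-bit case, for illustration only (the helper name and std::array signature are invented, not dynarmic code):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Reference semantics for VectorMultiplySignedWiden8 (illustrative only):
    // the eight 8-bit lanes in the low 64 bits of each operand are
    // sign-extended to 16 bits and multiplied, yielding eight 16-bit products.
    std::array<int16_t, 8> MultiplySignedWiden8(const std::array<int8_t, 8>& n,
                                                const std::array<int8_t, 8>& m) {
        std::array<int16_t, 8> result{};
        for (std::size_t i = 0; i < 8; ++i) {
            // The narrowing cast back to int16_t is exact: the product of two
            // int8_t values is at most 16384, which fits in 16 bits.
            result[i] = static_cast<int16_t>(int{n[i]} * int{m[i]});
        }
        return result;
    }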
@@ -1001,6 +1001,54 @@ void EmitIR<IR::Opcode::VectorMultiply64>(oaknut::CodeGenerator& code, EmitConte
     ASSERT_FALSE("Unimplemented");
 }
 
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
 template<>
 void EmitIR<IR::Opcode::VectorNarrow16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
     (void)code;
@@ -55,6 +55,7 @@ static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) {
 static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
     return Optimization::PolyfillOptions{
         .sha256 = !code.HasHostFeature(HostFeature::SHA),
+        .vector_multiply_widen = true,
     };
 }
 
@@ -51,6 +51,7 @@ static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) {
 static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
     return Optimization::PolyfillOptions{
         .sha256 = !code.HasHostFeature(HostFeature::SHA),
+        .vector_multiply_widen = true,
     };
 }
 
@@ -2221,6 +2221,30 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, tmp2);
 }
 
+void EmitX64::EmitVectorMultiplySignedWiden8(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplySignedWiden8");
+}
+
+void EmitX64::EmitVectorMultiplySignedWiden16(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplySignedWiden16");
+}
+
+void EmitX64::EmitVectorMultiplySignedWiden32(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplySignedWiden32");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden8(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden8");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden16(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden16");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden32(EmitContext&, IR::Inst*) {
+    ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden32");
+}
+
 void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -909,11 +909,30 @@ bool TranslatorVisitor::asimd_VABDL(bool U, bool D, size_t sz, size_t Vn, size_t
 }
 
 bool TranslatorVisitor::asimd_VMLAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
-    return WideInstruction(*this, U, D, sz, Vn, Vd, N, M, Vm, WidenBehaviour::Both, [this, op](size_t esize, const auto& reg_d, const auto& reg_n, const auto& reg_m) {
-        const auto multiply = ir.VectorMultiply(esize, reg_n, reg_m);
-        return op ? ir.VectorSub(esize, reg_d, multiply)
-                  : ir.VectorAdd(esize, reg_d, multiply);
-    });
+    const size_t esize = 8U << sz;
+
+    if (sz == 0b11) {
+        return DecodeError();
+    }
+
+    if (mcl::bit::get_bit<0>(Vd)) {
+        return UndefinedInstruction();
+    }
+
+    const auto d = ToVector(true, Vd, D);
+    const auto m = ToVector(false, Vm, M);
+    const auto n = ToVector(false, Vn, N);
+
+    const auto reg_d = ir.GetVector(d);
+    const auto reg_m = ir.GetVector(m);
+    const auto reg_n = ir.GetVector(n);
+    const auto multiply = U ? ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+                            : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
+    const auto result = op ? ir.VectorSub(esize * 2, reg_d, multiply)
+                           : ir.VectorAdd(esize * 2, reg_d, multiply);
+
+    ir.SetVector(d, result);
+    return true;
 }
 
 bool TranslatorVisitor::asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool P, bool N, bool M, size_t Vm) {
@@ -930,14 +949,11 @@ bool TranslatorVisitor::asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t
     const auto m = ToVector(false, Vm, M);
     const auto n = ToVector(false, Vn, N);
 
-    const auto extend_reg = [&](const auto& reg) {
-        return U ? ir.VectorZeroExtend(esize, reg) : ir.VectorSignExtend(esize, reg);
-    };
-
     const auto reg_n = ir.GetVector(n);
     const auto reg_m = ir.GetVector(m);
     const auto result = P ? ir.VectorPolynomialMultiplyLong(esize, reg_n, reg_m)
-                          : ir.VectorMultiply(2 * esize, extend_reg(reg_n), extend_reg(reg_m));
+                      : U ? ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+                          : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
 
     ir.SetVector(d, result);
     return true;
@@ -85,11 +85,10 @@ bool ScalarMultiplyLong(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t
     const auto [m, index] = GetScalarLocation(esize, M, Vm);
 
     const auto scalar = v.ir.VectorGetElement(esize, v.ir.GetVector(m), index);
-    const auto ext_scalar = U ? (esize == 16 ? IR::U32U64{v.ir.ZeroExtendToWord(scalar)} : IR::U32U64{v.ir.ZeroExtendToLong(scalar)})
-                              : (esize == 16 ? IR::U32U64{v.ir.SignExtendToWord(scalar)} : IR::U32U64{v.ir.SignExtendToLong(scalar)});
-    const auto reg_n = U ? v.ir.VectorZeroExtend(esize, v.ir.GetVector(n)) : v.ir.VectorSignExtend(esize, v.ir.GetVector(n));
-    const auto reg_m = v.ir.VectorBroadcast(esize * 2, ext_scalar);
-    const auto addend = v.ir.VectorMultiply(esize * 2, reg_n, reg_m);
+    const auto reg_n = v.ir.GetVector(n);
+    const auto reg_m = v.ir.VectorBroadcast(esize, scalar);
+    const auto addend = U ? v.ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+                          : v.ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
     const auto result = [&] {
         switch (multiply) {
         case MultiplyBehavior::Multiply:
@@ -1404,6 +1404,30 @@ U128 IREmitter::VectorMultiply(size_t esize, const U128& a, const U128& b) {
     UNREACHABLE();
 }
 
+U128 IREmitter::VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorMultiplySignedWiden8, a, b);
+    case 16:
+        return Inst<U128>(Opcode::VectorMultiplySignedWiden16, a, b);
+    case 32:
+        return Inst<U128>(Opcode::VectorMultiplySignedWiden32, a, b);
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden8, a, b);
+    case 16:
+        return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden16, a, b);
+    case 32:
+        return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden32, a, b);
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::VectorNarrow(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 16:
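The two IREmitter helpers mirror the opcode naming: callers pass the source element size and get back a vector of double-width products. A short usage sketch (the wrapper function is invented for illustration and assumes dynarmic's IR headers are available):

    // Sketch only: emit a widening lane-wise product of 8-bit lanes.
    // reg_n and reg_m are assumed to hold eight 8-bit lanes in their low 64 bits.
    IR::U128 EmitWideningProduct(IR::IREmitter& ir, const IR::U128& reg_n,
                                 const IR::U128& reg_m, bool is_unsigned) {
        // The esize argument names the *source* lane width; the result has
        // 16-bit lanes. Any esize other than 8, 16, or 32 hits UNREACHABLE().
        return is_unsigned ? ir.VectorMultiplyUnsignedWiden(8, reg_n, reg_m)
                           : ir.VectorMultiplySignedWiden(8, reg_n, reg_m);
    }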
@@ -264,6 +264,8 @@ public:
     U128 VectorMinSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorMinUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorMultiply(size_t esize, const U128& a, const U128& b);
+    U128 VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b);
+    U128 VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b);
     U128 VectorNarrow(size_t original_esize, const U128& a);
     U128 VectorNot(const U128& a);
     U128 VectorOr(const U128& a, const U128& b);
@@ -402,6 +402,12 @@ OPCODE(VectorMultiply8, U128, U128
 OPCODE(VectorMultiply16,               U128, U128, U128 )
 OPCODE(VectorMultiply32,               U128, U128, U128 )
 OPCODE(VectorMultiply64,               U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden8,     U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden16,    U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden32,    U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden8,   U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden16,  U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden32,  U128, U128, U128 )
 OPCODE(VectorNarrow16,                 U128, U128       )
 OPCODE(VectorNarrow32,                 U128, U128       )
 OPCODE(VectorNarrow64,                 U128, U128       )
@@ -22,6 +22,7 @@ namespace Dynarmic::Optimization {
 
 struct PolyfillOptions {
     bool sha256 = false;
+    bool vector_multiply_widen = false;
 
     bool operator==(const PolyfillOptions&) const = default;
 };
@@ -138,6 +138,19 @@ void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
     inst.ReplaceUsesWith(part1 ? x : y);
 }
 
+template<size_t esize, bool is_signed>
+void PolyfillVectorMultiplyWiden(IR::IREmitter& ir, IR::Inst& inst) {
+    IR::U128 n = (IR::U128)inst.GetArg(0);
+    IR::U128 m = (IR::U128)inst.GetArg(1);
+
+    const IR::U128 wide_n = is_signed ? ir.VectorSignExtend(esize, n) : ir.VectorZeroExtend(esize, n);
+    const IR::U128 wide_m = is_signed ? ir.VectorSignExtend(esize, m) : ir.VectorZeroExtend(esize, m);
+
+    const IR::U128 result = ir.VectorMultiply(esize * 2, wide_n, wide_m);
+
+    inst.ReplaceUsesWith(result);
+}
+
 } // namespace
 
 void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
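Design note: the polyfill is built entirely from pre-existing IR operations (a VectorSignExtend or VectorZeroExtend of each operand, followed by a full-width VectorMultiply), so backends that never select the new opcodes, such as x86-64 above whose emitters assert, need no new lowering. A backend with native widening multiplies (for instance the smull/umull family on AArch64) could later implement the opcodes directly and stop requesting the polyfill.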
@@ -166,6 +179,36 @@ void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
                 PolyfillSHA256Hash(ir, inst);
             }
             break;
+        case IR::Opcode::VectorMultiplySignedWiden8:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<8, true>(ir, inst);
+            }
+            break;
+        case IR::Opcode::VectorMultiplySignedWiden16:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<16, true>(ir, inst);
+            }
+            break;
+        case IR::Opcode::VectorMultiplySignedWiden32:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<32, true>(ir, inst);
+            }
+            break;
+        case IR::Opcode::VectorMultiplyUnsignedWiden8:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<8, false>(ir, inst);
+            }
+            break;
+        case IR::Opcode::VectorMultiplyUnsignedWiden16:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<16, false>(ir, inst);
+            }
+            break;
+        case IR::Opcode::VectorMultiplyUnsignedWiden32:
+            if (polyfill.vector_multiply_widen) {
+                PolyfillVectorMultiplyWiden<32, false>(ir, inst);
+            }
+            break;
         default:
             break;
         }
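Putting the pass together: with vector_multiply_widen set, each widen instruction is replaced in place by the extend-and-multiply sequence. In rough IR pseudocode for the signed 8-bit case (value names invented; opcode spellings as registered in opcodes.inc):

    ; before the pass
    %2 = VectorMultiplySignedWiden8 %0, %1

    ; after PolyfillVectorMultiplyWiden<8, true>
    %3 = VectorSignExtend8 %0        ; widen 8-bit lanes to 16-bit lanes
    %4 = VectorSignExtend8 %1
    %2 = VectorMultiply16 %3, %4     ; full-width multiply on the widened lanes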