diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp index 5e879b0b..65f5d5cd 100644 --- a/src/backend_x64/emit_x64_vector.cpp +++ b/src/backend_x64/emit_x64_vector.cpp @@ -69,6 +69,30 @@ static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Ins ctx.reg_alloc.DefineValue(inst, xmm0); } +template +static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast*>(lambda); + constexpr u32 stack_space = 2 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.CallFunction(fn); + code.movaps(xmm0, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + code.add(rsp, stack_space + ABI_SHADOW_SPACE); + + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8()); + + ctx.reg_alloc.DefineValue(inst, xmm0); +} + template static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { const auto fn = static_cast*>(lambda); @@ -2169,6 +2193,73 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i EmitVectorSignedAbsoluteDifference(32, ctx, inst, code); } +// Narrows each signed element with unsigned saturation (packuswb/packusdw) and ORs 1 into fpsr_qc when zero-extending the packed result does not reproduce the source. +static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(dest, 
src); + code.pxor(zero, zero); + + switch (original_esize) { + case 16: + code.packuswb(dest, dest); + code.movdqa(reconstructed, dest); + code.punpcklbw(reconstructed, zero); + break; + case 32: + // NOTE(review): packusdw is an SSE4.1 instruction and is emitted here without a DoesCpuSupport check; confirm hosts lacking SSE4.1 cannot reach this case. + code.packusdw(dest, dest); + code.movdqa(reconstructed, dest); + code.punpcklwd(reconstructed, zero); + break; + default: + UNREACHABLE(); + break; + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { + code.pxor(reconstructed, src); + code.ptest(reconstructed, reconstructed); + } else { + code.pcmpeqd(reconstructed, src); + code.movmskps(bit, reconstructed); + // pcmpeqd leaves all-ones in every dword lane that round-tripped, so the mask is 0b1111 only when nothing saturated; any other mask must set QC. + code.cmp(bit, 0b1111); + } + + code.setnz(bit.cvt8()); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8()); + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNarrowToUnsigned(16, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray& result, const VectorArray& a) { + bool qc_flag = false; + result.fill(0); + for (size_t i = 0; i < a.size(); ++i) { + const s64 saturated = std::clamp(a[i], 0, 0xFFFFFFFF); + result[i] = static_cast(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb); } diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc index 1401bb60..4f34be5d 100644 --- a/src/frontend/A64/decoder/a64.inc +++ b/src/frontend/A64/decoder/a64.inc @@ -622,7 +622,7 @@ INST(UADDLP, "UADDLP", "0Q101 INST(CMGE_zero_2, "CMGE 
(zero)", "0Q101110zz100000100010nnnnnddddd") INST(CMLE_2, "CMLE (zero)", "0Q101110zz100000100110nnnnnddddd") INST(NEG_2, "NEG (vector)", "0Q101110zz100000101110nnnnnddddd") -//INST(SQXTUN_2, "SQXTUN, SQXTUN2", "0Q101110zz100001001010nnnnnddddd") +INST(SQXTUN_2, "SQXTUN, SQXTUN2", "0Q101110zz100001001010nnnnnddddd") INST(SHLL, "SHLL, SHLL2", "0Q101110zz100001001110nnnnnddddd") //INST(UQXTN_2, "UQXTN, UQXTN2", "0Q101110zz100001010010nnnnnddddd") //INST(FCVTXN_2, "FCVTXN, FCVTXN2", "0Q1011100z100001011010nnnnnddddd") diff --git a/src/frontend/A64/translate/impl/impl.h b/src/frontend/A64/translate/impl/impl.h index 7dde70d7..4664ceec 100644 --- a/src/frontend/A64/translate/impl/impl.h +++ b/src/frontend/A64/translate/impl/impl.h @@ -539,7 +539,7 @@ struct TranslatorVisitor final { bool NEG_1(Imm<2> size, Vec Vn, Vec Vd); bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); bool SQXTUN_1(Imm<2> size, Vec Vn, Reg Rd); - bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd); + bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); bool UQXTN_1(Imm<2> size, Vec Vn, Reg Rd); bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd); bool FCVTXN_1(bool sz, Vec Vn, Reg Rd); diff --git a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp index f93fd8e1..014127a0 100644 --- a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp +++ b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp @@ -275,6 +275,22 @@ bool TranslatorVisitor::NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { return true; } +bool TranslatorVisitor::SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + const size_t part = Q ? 
1 : 0; + + const IR::U128 operand = V(2 * datasize, Vn); + const IR::U128 result = ir.VectorSignedSaturatedNarrowToUnsigned(2 * esize, operand); + + Vpart(datasize, Vd, part, result); + return true; +} + bool TranslatorVisitor::NOT(bool Q, Vec Vn, Vec Vd) { const size_t datasize = Q ? 128 : 64; diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 84d462d2..0f335302 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1292,6 +1292,19 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons return {}; } +U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) { + switch (original_esize) { + case 16: + return Inst(Opcode::VectorSignedSaturatedNarrowToUnsigned16, a); + case 32: + return Inst(Opcode::VectorSignedSaturatedNarrowToUnsigned32, a); + case 64: + return Inst(Opcode::VectorSignedSaturatedNarrowToUnsigned64, a); + } + UNREACHABLE(); + return {}; +} + U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) { switch (esize) { case 8: diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 6820b0a0..c981c915 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -249,6 +249,7 @@ public: U128 VectorShuffleWords(const U128& a, u8 mask); U128 VectorSignExtend(size_t original_esize, const U128& a); U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a); U128 VectorSub(size_t esize, const U128& a, const U128& b); U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); U128 VectorZeroExtend(size_t original_esize, const U128& a); diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp index f7ffb8ca..a26dc81a 100644 --- a/src/frontend/ir/microinstruction.cpp +++ b/src/frontend/ir/microinstruction.cpp @@ -340,7 +340,15 @@ 
bool Inst::ReadsFromFPSRCumulativeSaturationBit() const { } bool Inst::WritesToFPSRCumulativeSaturationBit() const { - return false; + switch (op) { + case Opcode::VectorSignedSaturatedNarrowToUnsigned16: + case Opcode::VectorSignedSaturatedNarrowToUnsigned32: + case Opcode::VectorSignedSaturatedNarrowToUnsigned64: + return true; + + default: + return false; + } } bool Inst::CausesCPUException() const { diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 2e7ec331..8d55164b 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -1,491 +1,494 @@ // opcode name, return type, arg1 type, arg2 type, arg3 type, ... -OPCODE(Void, T::Void, ) -OPCODE(Identity, T::Opaque, T::Opaque ) -OPCODE(Breakpoint, T::Void, ) +OPCODE(Void, T::Void, ) +OPCODE(Identity, T::Opaque, T::Opaque ) +OPCODE(Breakpoint, T::Void, ) // A32 Context getters/setters -A32OPC(GetRegister, T::U32, T::A32Reg ) -A32OPC(GetExtendedRegister32, T::U32, T::A32ExtReg ) -A32OPC(GetExtendedRegister64, T::U64, T::A32ExtReg ) -A32OPC(SetRegister, T::Void, T::A32Reg, T::U32 ) -A32OPC(SetExtendedRegister32, T::Void, T::A32ExtReg, T::U32 ) -A32OPC(SetExtendedRegister64, T::Void, T::A32ExtReg, T::U64 ) -A32OPC(GetCpsr, T::U32, ) -A32OPC(SetCpsr, T::Void, T::U32 ) -A32OPC(SetCpsrNZCV, T::Void, T::U32 ) -A32OPC(SetCpsrNZCVQ, T::Void, T::U32 ) -A32OPC(GetNFlag, T::U1, ) -A32OPC(SetNFlag, T::Void, T::U1 ) -A32OPC(GetZFlag, T::U1, ) -A32OPC(SetZFlag, T::Void, T::U1 ) -A32OPC(GetCFlag, T::U1, ) -A32OPC(SetCFlag, T::Void, T::U1 ) -A32OPC(GetVFlag, T::U1, ) -A32OPC(SetVFlag, T::Void, T::U1 ) -A32OPC(OrQFlag, T::Void, T::U1 ) -A32OPC(GetGEFlags, T::U32, ) -A32OPC(SetGEFlags, T::Void, T::U32 ) -A32OPC(SetGEFlagsCompressed, T::Void, T::U32 ) -A32OPC(BXWritePC, T::Void, T::U32 ) -A32OPC(CallSupervisor, T::Void, T::U32 ) -A32OPC(ExceptionRaised, T::Void, T::U32, T::U64 ) -A32OPC(GetFpscr, T::U32, ) -A32OPC(SetFpscr, T::Void, T::U32, ) -A32OPC(GetFpscrNZCV, T::U32, ) 
-A32OPC(SetFpscrNZCV, T::Void, T::NZCVFlags ) +A32OPC(GetRegister, T::U32, T::A32Reg ) +A32OPC(GetExtendedRegister32, T::U32, T::A32ExtReg ) +A32OPC(GetExtendedRegister64, T::U64, T::A32ExtReg ) +A32OPC(SetRegister, T::Void, T::A32Reg, T::U32 ) +A32OPC(SetExtendedRegister32, T::Void, T::A32ExtReg, T::U32 ) +A32OPC(SetExtendedRegister64, T::Void, T::A32ExtReg, T::U64 ) +A32OPC(GetCpsr, T::U32, ) +A32OPC(SetCpsr, T::Void, T::U32 ) +A32OPC(SetCpsrNZCV, T::Void, T::U32 ) +A32OPC(SetCpsrNZCVQ, T::Void, T::U32 ) +A32OPC(GetNFlag, T::U1, ) +A32OPC(SetNFlag, T::Void, T::U1 ) +A32OPC(GetZFlag, T::U1, ) +A32OPC(SetZFlag, T::Void, T::U1 ) +A32OPC(GetCFlag, T::U1, ) +A32OPC(SetCFlag, T::Void, T::U1 ) +A32OPC(GetVFlag, T::U1, ) +A32OPC(SetVFlag, T::Void, T::U1 ) +A32OPC(OrQFlag, T::Void, T::U1 ) +A32OPC(GetGEFlags, T::U32, ) +A32OPC(SetGEFlags, T::Void, T::U32 ) +A32OPC(SetGEFlagsCompressed, T::Void, T::U32 ) +A32OPC(BXWritePC, T::Void, T::U32 ) +A32OPC(CallSupervisor, T::Void, T::U32 ) +A32OPC(ExceptionRaised, T::Void, T::U32, T::U64 ) +A32OPC(GetFpscr, T::U32, ) +A32OPC(SetFpscr, T::Void, T::U32, ) +A32OPC(GetFpscrNZCV, T::U32, ) +A32OPC(SetFpscrNZCV, T::Void, T::NZCVFlags ) // A64 Context getters/setters -A64OPC(SetCheckBit, T::Void, T::U1 ) -A64OPC(GetCFlag, T::U1, ) -A64OPC(SetNZCV, T::Void, T::NZCVFlags ) -A64OPC(GetW, T::U32, T::A64Reg ) -A64OPC(GetX, T::U64, T::A64Reg ) -//A64OPC(GetB, T::U128, T::A64Vec ) -//A64OPC(GetH, T::U128, T::A64Vec ) -A64OPC(GetS, T::U128, T::A64Vec ) -A64OPC(GetD, T::U128, T::A64Vec ) -A64OPC(GetQ, T::U128, T::A64Vec ) -A64OPC(GetSP, T::U64, ) -A64OPC(GetFPCR, T::U32, ) -A64OPC(GetFPSR, T::U32, ) -A64OPC(SetW, T::Void, T::A64Reg, T::U32 ) -A64OPC(SetX, T::Void, T::A64Reg, T::U64 ) -//A64OPC(SetB, T::Void, T::A64Vec, T::U8 ) -//A64OPC(SetH, T::Void, T::A64Vec, T::U16 ) -A64OPC(SetS, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetD, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetQ, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetSP, T::Void, T::U64 ) 
-A64OPC(SetFPCR, T::Void, T::U32 ) -A64OPC(SetFPSR, T::Void, T::U32 ) -A64OPC(SetPC, T::Void, T::U64 ) -A64OPC(CallSupervisor, T::Void, T::U32 ) -A64OPC(ExceptionRaised, T::Void, T::U64, T::U64 ) -A64OPC(DataCacheOperationRaised, T::Void, T::U64, T::U64 ) -A64OPC(DataSynchronizationBarrier, T::Void, ) -A64OPC(DataMemoryBarrier, T::Void, ) -A64OPC(GetCNTPCT, T::U64, ) -A64OPC(GetCTR, T::U32, ) -A64OPC(GetDCZID, T::U32, ) -A64OPC(GetTPIDR, T::U64, ) -A64OPC(SetTPIDR, T::Void, T::U64 ) -A64OPC(GetTPIDRRO, T::U64, ) +A64OPC(SetCheckBit, T::Void, T::U1 ) +A64OPC(GetCFlag, T::U1, ) +A64OPC(SetNZCV, T::Void, T::NZCVFlags ) +A64OPC(GetW, T::U32, T::A64Reg ) +A64OPC(GetX, T::U64, T::A64Reg ) +//A64OPC(GetB, T::U128, T::A64Vec ) +//A64OPC(GetH, T::U128, T::A64Vec ) +A64OPC(GetS, T::U128, T::A64Vec ) +A64OPC(GetD, T::U128, T::A64Vec ) +A64OPC(GetQ, T::U128, T::A64Vec ) +A64OPC(GetSP, T::U64, ) +A64OPC(GetFPCR, T::U32, ) +A64OPC(GetFPSR, T::U32, ) +A64OPC(SetW, T::Void, T::A64Reg, T::U32 ) +A64OPC(SetX, T::Void, T::A64Reg, T::U64 ) +//A64OPC(SetB, T::Void, T::A64Vec, T::U8 ) +//A64OPC(SetH, T::Void, T::A64Vec, T::U16 ) +A64OPC(SetS, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetD, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetQ, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetSP, T::Void, T::U64 ) +A64OPC(SetFPCR, T::Void, T::U32 ) +A64OPC(SetFPSR, T::Void, T::U32 ) +A64OPC(SetPC, T::Void, T::U64 ) +A64OPC(CallSupervisor, T::Void, T::U32 ) +A64OPC(ExceptionRaised, T::Void, T::U64, T::U64 ) +A64OPC(DataCacheOperationRaised, T::Void, T::U64, T::U64 ) +A64OPC(DataSynchronizationBarrier, T::Void, ) +A64OPC(DataMemoryBarrier, T::Void, ) +A64OPC(GetCNTPCT, T::U64, ) +A64OPC(GetCTR, T::U32, ) +A64OPC(GetDCZID, T::U32, ) +A64OPC(GetTPIDR, T::U64, ) +A64OPC(SetTPIDR, T::Void, T::U64 ) +A64OPC(GetTPIDRRO, T::U64, ) // Hints -OPCODE(PushRSB, T::Void, T::U64 ) +OPCODE(PushRSB, T::Void, T::U64 ) // Pseudo-operation, handled special ly at final emit -OPCODE(GetCarryFromOp, T::U1, T::U32 ) 
-OPCODE(GetOverflowFromOp, T::U1, T::U32 ) -OPCODE(GetGEFromOp, T::U32, T::U32 ) -OPCODE(GetNZCVFromOp, T::NZCVFlags, T::Opaque ) +OPCODE(GetCarryFromOp, T::U1, T::U32 ) +OPCODE(GetOverflowFromOp, T::U1, T::U32 ) +OPCODE(GetGEFromOp, T::U32, T::U32 ) +OPCODE(GetNZCVFromOp, T::NZCVFlags, T::Opaque ) -OPCODE(NZCVFromPackedFlags, T::NZCVFlags, T::U32 ) +OPCODE(NZCVFromPackedFlags, T::NZCVFlags, T::U32 ) // Calculations -OPCODE(Pack2x32To1x64, T::U64, T::U32, T::U32 ) -OPCODE(Pack2x64To1x128, T::U128, T::U64, T::U64 ) -OPCODE(LeastSignificantWord, T::U32, T::U64 ) -OPCODE(MostSignificantWord, T::U32, T::U64 ) -OPCODE(LeastSignificantHalf, T::U16, T::U32 ) -OPCODE(LeastSignificantByte, T::U8, T::U32 ) -OPCODE(MostSignificantBit, T::U1, T::U32 ) -OPCODE(IsZero32, T::U1, T::U32 ) -OPCODE(IsZero64, T::U1, T::U64 ) -OPCODE(TestBit, T::U1, T::U64, T::U8 ) -OPCODE(ConditionalSelect32, T::U32, T::Cond, T::U32, T::U32 ) -OPCODE(ConditionalSelect64, T::U64, T::Cond, T::U64, T::U64 ) -OPCODE(ConditionalSelectNZCV, T::NZCVFlags, T::Cond, T::NZCVFlags, T::NZCVFlags ) -OPCODE(LogicalShiftLeft32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(LogicalShiftLeft64, T::U64, T::U64, T::U8 ) -OPCODE(LogicalShiftRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(LogicalShiftRight64, T::U64, T::U64, T::U8 ) -OPCODE(ArithmeticShiftRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(ArithmeticShiftRight64, T::U64, T::U64, T::U8 ) -OPCODE(RotateRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(RotateRight64, T::U64, T::U64, T::U8 ) -OPCODE(RotateRightExtended, T::U32, T::U32, T::U1 ) -OPCODE(Add32, T::U32, T::U32, T::U32, T::U1 ) -OPCODE(Add64, T::U64, T::U64, T::U64, T::U1 ) -OPCODE(Sub32, T::U32, T::U32, T::U32, T::U1 ) -OPCODE(Sub64, T::U64, T::U64, T::U64, T::U1 ) -OPCODE(Mul32, T::U32, T::U32, T::U32 ) -OPCODE(Mul64, T::U64, T::U64, T::U64 ) -OPCODE(SignedMultiplyHigh64, T::U64, T::U64, T::U64 ) -OPCODE(UnsignedMultiplyHigh64, T::U64, T::U64, T::U64 ) -OPCODE(UnsignedDiv32, T::U32, T::U32, T::U32 ) 
-OPCODE(UnsignedDiv64, T::U64, T::U64, T::U64 ) -OPCODE(SignedDiv32, T::U32, T::U32, T::U32 ) -OPCODE(SignedDiv64, T::U64, T::U64, T::U64 ) -OPCODE(And32, T::U32, T::U32, T::U32 ) -OPCODE(And64, T::U64, T::U64, T::U64 ) -OPCODE(Eor32, T::U32, T::U32, T::U32 ) -OPCODE(Eor64, T::U64, T::U64, T::U64 ) -OPCODE(Or32, T::U32, T::U32, T::U32 ) -OPCODE(Or64, T::U64, T::U64, T::U64 ) -OPCODE(Not32, T::U32, T::U32 ) -OPCODE(Not64, T::U64, T::U64 ) -OPCODE(SignExtendByteToWord, T::U32, T::U8 ) -OPCODE(SignExtendHalfToWord, T::U32, T::U16 ) -OPCODE(SignExtendByteToLong, T::U64, T::U8 ) -OPCODE(SignExtendHalfToLong, T::U64, T::U16 ) -OPCODE(SignExtendWordToLong, T::U64, T::U32 ) -OPCODE(ZeroExtendByteToWord, T::U32, T::U8 ) -OPCODE(ZeroExtendHalfToWord, T::U32, T::U16 ) -OPCODE(ZeroExtendByteToLong, T::U64, T::U8 ) -OPCODE(ZeroExtendHalfToLong, T::U64, T::U16 ) -OPCODE(ZeroExtendWordToLong, T::U64, T::U32 ) -OPCODE(ZeroExtendLongToQuad, T::U128, T::U64 ) -OPCODE(ByteReverseWord, T::U32, T::U32 ) -OPCODE(ByteReverseHalf, T::U16, T::U16 ) -OPCODE(ByteReverseDual, T::U64, T::U64 ) -OPCODE(CountLeadingZeros32, T::U32, T::U32 ) -OPCODE(CountLeadingZeros64, T::U64, T::U64 ) -OPCODE(ExtractRegister32, T::U32, T::U32, T::U32, T::U8 ) -OPCODE(ExtractRegister64, T::U64, T::U64, T::U64, T::U8 ) +OPCODE(Pack2x32To1x64, T::U64, T::U32, T::U32 ) +OPCODE(Pack2x64To1x128, T::U128, T::U64, T::U64 ) +OPCODE(LeastSignificantWord, T::U32, T::U64 ) +OPCODE(MostSignificantWord, T::U32, T::U64 ) +OPCODE(LeastSignificantHalf, T::U16, T::U32 ) +OPCODE(LeastSignificantByte, T::U8, T::U32 ) +OPCODE(MostSignificantBit, T::U1, T::U32 ) +OPCODE(IsZero32, T::U1, T::U32 ) +OPCODE(IsZero64, T::U1, T::U64 ) +OPCODE(TestBit, T::U1, T::U64, T::U8 ) +OPCODE(ConditionalSelect32, T::U32, T::Cond, T::U32, T::U32 ) +OPCODE(ConditionalSelect64, T::U64, T::Cond, T::U64, T::U64 ) +OPCODE(ConditionalSelectNZCV, T::NZCVFlags, T::Cond, T::NZCVFlags, T::NZCVFlags ) +OPCODE(LogicalShiftLeft32, T::U32, T::U32, T::U8, T::U1 ) 
+OPCODE(LogicalShiftLeft64, T::U64, T::U64, T::U8 ) +OPCODE(LogicalShiftRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(LogicalShiftRight64, T::U64, T::U64, T::U8 ) +OPCODE(ArithmeticShiftRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(ArithmeticShiftRight64, T::U64, T::U64, T::U8 ) +OPCODE(RotateRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(RotateRight64, T::U64, T::U64, T::U8 ) +OPCODE(RotateRightExtended, T::U32, T::U32, T::U1 ) +OPCODE(Add32, T::U32, T::U32, T::U32, T::U1 ) +OPCODE(Add64, T::U64, T::U64, T::U64, T::U1 ) +OPCODE(Sub32, T::U32, T::U32, T::U32, T::U1 ) +OPCODE(Sub64, T::U64, T::U64, T::U64, T::U1 ) +OPCODE(Mul32, T::U32, T::U32, T::U32 ) +OPCODE(Mul64, T::U64, T::U64, T::U64 ) +OPCODE(SignedMultiplyHigh64, T::U64, T::U64, T::U64 ) +OPCODE(UnsignedMultiplyHigh64, T::U64, T::U64, T::U64 ) +OPCODE(UnsignedDiv32, T::U32, T::U32, T::U32 ) +OPCODE(UnsignedDiv64, T::U64, T::U64, T::U64 ) +OPCODE(SignedDiv32, T::U32, T::U32, T::U32 ) +OPCODE(SignedDiv64, T::U64, T::U64, T::U64 ) +OPCODE(And32, T::U32, T::U32, T::U32 ) +OPCODE(And64, T::U64, T::U64, T::U64 ) +OPCODE(Eor32, T::U32, T::U32, T::U32 ) +OPCODE(Eor64, T::U64, T::U64, T::U64 ) +OPCODE(Or32, T::U32, T::U32, T::U32 ) +OPCODE(Or64, T::U64, T::U64, T::U64 ) +OPCODE(Not32, T::U32, T::U32 ) +OPCODE(Not64, T::U64, T::U64 ) +OPCODE(SignExtendByteToWord, T::U32, T::U8 ) +OPCODE(SignExtendHalfToWord, T::U32, T::U16 ) +OPCODE(SignExtendByteToLong, T::U64, T::U8 ) +OPCODE(SignExtendHalfToLong, T::U64, T::U16 ) +OPCODE(SignExtendWordToLong, T::U64, T::U32 ) +OPCODE(ZeroExtendByteToWord, T::U32, T::U8 ) +OPCODE(ZeroExtendHalfToWord, T::U32, T::U16 ) +OPCODE(ZeroExtendByteToLong, T::U64, T::U8 ) +OPCODE(ZeroExtendHalfToLong, T::U64, T::U16 ) +OPCODE(ZeroExtendWordToLong, T::U64, T::U32 ) +OPCODE(ZeroExtendLongToQuad, T::U128, T::U64 ) +OPCODE(ByteReverseWord, T::U32, T::U32 ) +OPCODE(ByteReverseHalf, T::U16, T::U16 ) +OPCODE(ByteReverseDual, T::U64, T::U64 ) +OPCODE(CountLeadingZeros32, T::U32, T::U32 ) 
+OPCODE(CountLeadingZeros64, T::U64, T::U64 ) +OPCODE(ExtractRegister32, T::U32, T::U32, T::U32, T::U8 ) +OPCODE(ExtractRegister64, T::U64, T::U64, T::U64, T::U8 ) // Saturated instructions -OPCODE(SignedSaturatedAdd, T::U32, T::U32, T::U32 ) -OPCODE(SignedSaturatedSub, T::U32, T::U32, T::U32 ) -OPCODE(UnsignedSaturation, T::U32, T::U32, T::U8 ) -OPCODE(SignedSaturation, T::U32, T::U32, T::U8 ) +OPCODE(SignedSaturatedAdd, T::U32, T::U32, T::U32 ) +OPCODE(SignedSaturatedSub, T::U32, T::U32, T::U32 ) +OPCODE(UnsignedSaturation, T::U32, T::U32, T::U8 ) +OPCODE(SignedSaturation, T::U32, T::U32, T::U8 ) // Packed instructions -OPCODE(PackedAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubU8, 
T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32 ) 
+OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 ) // CRC instructions -OPCODE(CRC32Castagnoli8, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli16, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli32, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli64, T::U32, T::U32, T::U64 ) -OPCODE(CRC32ISO8, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO16, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO32, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO64, T::U32, T::U32, T::U64 ) +OPCODE(CRC32Castagnoli8, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli16, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli32, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli64, T::U32, T::U32, T::U64 ) +OPCODE(CRC32ISO8, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO16, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO32, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO64, T::U32, T::U32, T::U64 ) // AES instructions -OPCODE(AESDecryptSingleRound, T::U128, T::U128 ) -OPCODE(AESEncryptSingleRound, T::U128, T::U128 ) -OPCODE(AESInverseMixColumns, T::U128, T::U128 ) -OPCODE(AESMixColumns, T::U128, T::U128 ) +OPCODE(AESDecryptSingleRound, T::U128, T::U128 ) +OPCODE(AESEncryptSingleRound, T::U128, T::U128 ) +OPCODE(AESInverseMixColumns, T::U128, T::U128 ) +OPCODE(AESMixColumns, T::U128, T::U128 ) // SM4 instructions -OPCODE(SM4AccessSubstitutionBox, T::U8, T::U8 ) +OPCODE(SM4AccessSubstitutionBox, T::U8, T::U8 ) // Vector instructions -OPCODE(VectorGetElement8, T::U8, T::U128, T::U8 ) -OPCODE(VectorGetElement16, T::U16, T::U128, T::U8 ) -OPCODE(VectorGetElement32, T::U32, T::U128, T::U8 ) -OPCODE(VectorGetElement64, T::U64, T::U128, T::U8 ) -OPCODE(VectorSetElement8, T::U128, T::U128, T::U8, T::U8 ) -OPCODE(VectorSetElement16, T::U128, T::U128, T::U8, T::U16 ) -OPCODE(VectorSetElement32, T::U128, T::U128, T::U8, T::U32 ) -OPCODE(VectorSetElement64, T::U128, T::U128, 
T::U8, T::U64 ) -OPCODE(VectorAbs8, T::U128, T::U128 ) -OPCODE(VectorAbs16, T::U128, T::U128 ) -OPCODE(VectorAbs32, T::U128, T::U128 ) -OPCODE(VectorAbs64, T::U128, T::U128 ) -OPCODE(VectorAdd8, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd16, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd32, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd64, T::U128, T::U128, T::U128 ) -OPCODE(VectorAnd, T::U128, T::U128, T::U128 ) -OPCODE(VectorArithmeticShiftRight8, T::U128, T::U128, T::U8 ) -OPCODE(VectorArithmeticShiftRight16, T::U128, T::U128, T::U8 ) -OPCODE(VectorArithmeticShiftRight32, T::U128, T::U128, T::U8 ) -OPCODE(VectorArithmeticShiftRight64, T::U128, T::U128, T::U8 ) -OPCODE(VectorBroadcastLower8, T::U128, T::U8 ) -OPCODE(VectorBroadcastLower16, T::U128, T::U16 ) -OPCODE(VectorBroadcastLower32, T::U128, T::U32 ) -OPCODE(VectorBroadcast8, T::U128, T::U8 ) -OPCODE(VectorBroadcast16, T::U128, T::U16 ) -OPCODE(VectorBroadcast32, T::U128, T::U32 ) -OPCODE(VectorBroadcast64, T::U128, T::U64 ) -OPCODE(VectorDeinterleaveEven8, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveEven16, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveEven32, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveEven64, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveOdd8, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveOdd16, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveOdd32, T::U128, T::U128, T::U128 ) -OPCODE(VectorDeinterleaveOdd64, T::U128, T::U128, T::U128 ) -OPCODE(VectorEor, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual8, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual16, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual32, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual64, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual128, T::U128, T::U128, T::U128 ) -OPCODE(VectorExtract, T::U128, T::U128, T::U128, T::U8 ) -OPCODE(VectorExtractLower, T::U128, T::U128, T::U128, T::U8 ) -OPCODE(VectorGreaterS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorGreaterS16, T::U128, 
T::U128, T::U128 ) -OPCODE(VectorGreaterS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorGreaterS64, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingAddU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorHalvingSubU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower8, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower16, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower32, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower64, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveUpper8, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveUpper16, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveUpper32, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveUpper64, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalShiftLeft8, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftLeft16, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftLeft32, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftLeft64, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftRight8, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftRight16, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftRight32, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalShiftRight64, T::U128, T::U128, T::U8 ) -OPCODE(VectorLogicalVShiftS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftS64, T::U128, 
T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorLogicalVShiftU64, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxS64, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorMaxU64, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinS64, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorMinU64, T::U128, T::U128, T::U128 ) -OPCODE(VectorMultiply8, T::U128, T::U128, T::U128 ) -OPCODE(VectorMultiply16, T::U128, T::U128, T::U128 ) -OPCODE(VectorMultiply32, T::U128, T::U128, T::U128 ) -OPCODE(VectorMultiply64, T::U128, T::U128, T::U128 ) -OPCODE(VectorNarrow16, T::U128, T::U128 ) -OPCODE(VectorNarrow32, T::U128, T::U128 ) -OPCODE(VectorNarrow64, T::U128, T::U128 ) -OPCODE(VectorNot, T::U128, T::U128 ) -OPCODE(VectorOr, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower8, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower16, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower32, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddSignedWiden8, T::U128, T::U128 ) -OPCODE(VectorPairedAddSignedWiden16, T::U128, T::U128 ) -OPCODE(VectorPairedAddSignedWiden32, T::U128, T::U128 ) -OPCODE(VectorPairedAddUnsignedWiden8, T::U128, T::U128 ) -OPCODE(VectorPairedAddUnsignedWiden16, T::U128, T::U128 ) -OPCODE(VectorPairedAddUnsignedWiden32, T::U128, 
T::U128 ) -OPCODE(VectorPairedAdd8, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 ) -OPCODE(VectorPopulationCount, T::U128, T::U128 ) -OPCODE(VectorReverseBits, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddS8, T::U128, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddS16, T::U128, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddS32, T::U128, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddU8, T::U128, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddU16, T::U128, T::U128, T::U128 ) -OPCODE(VectorRoundingHalvingAddU32, T::U128, T::U128, T::U128 ) -OPCODE(VectorShuffleHighHalfwords, T::U128, T::U128, T::U8 ) -OPCODE(VectorShuffleLowHalfwords, T::U128, T::U128, T::U8 ) -OPCODE(VectorShuffleWords, T::U128, T::U128, T::U8 ) -OPCODE(VectorSignExtend8, T::U128, T::U128 ) -OPCODE(VectorSignExtend16, T::U128, T::U128 ) -OPCODE(VectorSignExtend32, T::U128, T::U128 ) -OPCODE(VectorSignExtend64, T::U128, T::U128 ) -OPCODE(VectorSignedAbsoluteDifference8, T::U128, T::U128, T::U128 ) -OPCODE(VectorSignedAbsoluteDifference16, T::U128, T::U128, T::U128 ) -OPCODE(VectorSignedAbsoluteDifference32, T::U128, T::U128, T::U128 ) -OPCODE(VectorSub8, T::U128, T::U128, T::U128 ) -OPCODE(VectorSub16, T::U128, T::U128, T::U128 ) -OPCODE(VectorSub32, T::U128, T::U128, T::U128 ) -OPCODE(VectorSub64, T::U128, T::U128, T::U128 ) -OPCODE(VectorUnsignedAbsoluteDifference8, T::U128, T::U128, T::U128 ) -OPCODE(VectorUnsignedAbsoluteDifference16, T::U128, T::U128, T::U128 ) -OPCODE(VectorUnsignedAbsoluteDifference32, T::U128, T::U128, T::U128 ) -OPCODE(VectorZeroExtend8, T::U128, T::U128 ) -OPCODE(VectorZeroExtend16, T::U128, T::U128 ) -OPCODE(VectorZeroExtend32, T::U128, T::U128 ) -OPCODE(VectorZeroExtend64, T::U128, T::U128 ) -OPCODE(VectorZeroUpper, T::U128, T::U128 ) -OPCODE(ZeroVector, T::U128, ) +OPCODE(VectorGetElement8, T::U8, 
T::U128, T::U8 ) +OPCODE(VectorGetElement16, T::U16, T::U128, T::U8 ) +OPCODE(VectorGetElement32, T::U32, T::U128, T::U8 ) +OPCODE(VectorGetElement64, T::U64, T::U128, T::U8 ) +OPCODE(VectorSetElement8, T::U128, T::U128, T::U8, T::U8 ) +OPCODE(VectorSetElement16, T::U128, T::U128, T::U8, T::U16 ) +OPCODE(VectorSetElement32, T::U128, T::U128, T::U8, T::U32 ) +OPCODE(VectorSetElement64, T::U128, T::U128, T::U8, T::U64 ) +OPCODE(VectorAbs8, T::U128, T::U128 ) +OPCODE(VectorAbs16, T::U128, T::U128 ) +OPCODE(VectorAbs32, T::U128, T::U128 ) +OPCODE(VectorAbs64, T::U128, T::U128 ) +OPCODE(VectorAdd8, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd16, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd32, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd64, T::U128, T::U128, T::U128 ) +OPCODE(VectorAnd, T::U128, T::U128, T::U128 ) +OPCODE(VectorArithmeticShiftRight8, T::U128, T::U128, T::U8 ) +OPCODE(VectorArithmeticShiftRight16, T::U128, T::U128, T::U8 ) +OPCODE(VectorArithmeticShiftRight32, T::U128, T::U128, T::U8 ) +OPCODE(VectorArithmeticShiftRight64, T::U128, T::U128, T::U8 ) +OPCODE(VectorBroadcastLower8, T::U128, T::U8 ) +OPCODE(VectorBroadcastLower16, T::U128, T::U16 ) +OPCODE(VectorBroadcastLower32, T::U128, T::U32 ) +OPCODE(VectorBroadcast8, T::U128, T::U8 ) +OPCODE(VectorBroadcast16, T::U128, T::U16 ) +OPCODE(VectorBroadcast32, T::U128, T::U32 ) +OPCODE(VectorBroadcast64, T::U128, T::U64 ) +OPCODE(VectorDeinterleaveEven8, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveEven16, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveEven32, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveEven64, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveOdd8, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveOdd16, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveOdd32, T::U128, T::U128, T::U128 ) +OPCODE(VectorDeinterleaveOdd64, T::U128, T::U128, T::U128 ) +OPCODE(VectorEor, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual8, T::U128, T::U128, T::U128 ) 
+OPCODE(VectorEqual16, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual32, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual64, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual128, T::U128, T::U128, T::U128 ) +OPCODE(VectorExtract, T::U128, T::U128, T::U128, T::U8 ) +OPCODE(VectorExtractLower, T::U128, T::U128, T::U128, T::U8 ) +OPCODE(VectorGreaterS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorGreaterS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorGreaterS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorGreaterS64, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingAddU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorHalvingSubU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower8, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower16, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower32, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower64, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveUpper8, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveUpper16, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveUpper32, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveUpper64, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalShiftLeft8, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft16, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft32, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft64, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftRight8, T::U128, T::U128, 
T::U8 ) +OPCODE(VectorLogicalShiftRight16, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftRight32, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftRight64, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalVShiftS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftS64, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalVShiftU64, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxS64, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorMaxU64, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinS64, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorMinU64, T::U128, T::U128, T::U128 ) +OPCODE(VectorMultiply8, T::U128, T::U128, T::U128 ) +OPCODE(VectorMultiply16, T::U128, T::U128, T::U128 ) +OPCODE(VectorMultiply32, T::U128, T::U128, T::U128 ) +OPCODE(VectorMultiply64, T::U128, T::U128, T::U128 ) +OPCODE(VectorNarrow16, T::U128, T::U128 ) +OPCODE(VectorNarrow32, T::U128, T::U128 ) +OPCODE(VectorNarrow64, T::U128, T::U128 ) +OPCODE(VectorNot, T::U128, T::U128 ) +OPCODE(VectorOr, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddLower8, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddLower16, T::U128, 
T::U128, T::U128 ) +OPCODE(VectorPairedAddLower32, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddSignedWiden8, T::U128, T::U128 ) +OPCODE(VectorPairedAddSignedWiden16, T::U128, T::U128 ) +OPCODE(VectorPairedAddSignedWiden32, T::U128, T::U128 ) +OPCODE(VectorPairedAddUnsignedWiden8, T::U128, T::U128 ) +OPCODE(VectorPairedAddUnsignedWiden16, T::U128, T::U128 ) +OPCODE(VectorPairedAddUnsignedWiden32, T::U128, T::U128 ) +OPCODE(VectorPairedAdd8, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 ) +OPCODE(VectorPopulationCount, T::U128, T::U128 ) +OPCODE(VectorReverseBits, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU32, T::U128, T::U128, T::U128 ) +OPCODE(VectorShuffleHighHalfwords, T::U128, T::U128, T::U8 ) +OPCODE(VectorShuffleLowHalfwords, T::U128, T::U128, T::U8 ) +OPCODE(VectorShuffleWords, T::U128, T::U128, T::U8 ) +OPCODE(VectorSignExtend8, T::U128, T::U128 ) +OPCODE(VectorSignExtend16, T::U128, T::U128 ) +OPCODE(VectorSignExtend32, T::U128, T::U128 ) +OPCODE(VectorSignExtend64, T::U128, T::U128 ) +OPCODE(VectorSignedAbsoluteDifference8, T::U128, T::U128, T::U128 ) +OPCODE(VectorSignedAbsoluteDifference16, T::U128, T::U128, T::U128 ) +OPCODE(VectorSignedAbsoluteDifference32, T::U128, T::U128, T::U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned16, T::U128, T::U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned32, T::U128, T::U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned64, T::U128, T::U128 ) +OPCODE(VectorSub8, T::U128, T::U128, T::U128 ) +OPCODE(VectorSub16, T::U128, T::U128, T::U128 
) +OPCODE(VectorSub32, T::U128, T::U128, T::U128 ) +OPCODE(VectorSub64, T::U128, T::U128, T::U128 ) +OPCODE(VectorUnsignedAbsoluteDifference8, T::U128, T::U128, T::U128 ) +OPCODE(VectorUnsignedAbsoluteDifference16, T::U128, T::U128, T::U128 ) +OPCODE(VectorUnsignedAbsoluteDifference32, T::U128, T::U128, T::U128 ) +OPCODE(VectorZeroExtend8, T::U128, T::U128 ) +OPCODE(VectorZeroExtend16, T::U128, T::U128 ) +OPCODE(VectorZeroExtend32, T::U128, T::U128 ) +OPCODE(VectorZeroExtend64, T::U128, T::U128 ) +OPCODE(VectorZeroUpper, T::U128, T::U128 ) +OPCODE(ZeroVector, T::U128, ) // Floating-point operations -OPCODE(FPAbs32, T::U32, T::U32 ) -OPCODE(FPAbs64, T::U64, T::U64 ) -OPCODE(FPAdd32, T::U32, T::U32, T::U32 ) -OPCODE(FPAdd64, T::U64, T::U64, T::U64 ) -OPCODE(FPCompare32, T::NZCVFlags, T::U32, T::U32, T::U1 ) -OPCODE(FPCompare64, T::NZCVFlags, T::U64, T::U64, T::U1 ) -OPCODE(FPDiv32, T::U32, T::U32, T::U32 ) -OPCODE(FPDiv64, T::U64, T::U64, T::U64 ) -OPCODE(FPMax32, T::U32, T::U32, T::U32 ) -OPCODE(FPMax64, T::U64, T::U64, T::U64 ) -OPCODE(FPMaxNumeric32, T::U32, T::U32, T::U32 ) -OPCODE(FPMaxNumeric64, T::U64, T::U64, T::U64 ) -OPCODE(FPMin32, T::U32, T::U32, T::U32 ) -OPCODE(FPMin64, T::U64, T::U64, T::U64 ) -OPCODE(FPMinNumeric32, T::U32, T::U32, T::U32 ) -OPCODE(FPMinNumeric64, T::U64, T::U64, T::U64 ) -OPCODE(FPMul32, T::U32, T::U32, T::U32 ) -OPCODE(FPMul64, T::U64, T::U64, T::U64 ) -OPCODE(FPMulAdd32, T::U32, T::U32, T::U32, T::U32 ) -OPCODE(FPMulAdd64, T::U64, T::U64, T::U64, T::U64 ) -OPCODE(FPNeg32, T::U32, T::U32 ) -OPCODE(FPNeg64, T::U64, T::U64 ) -OPCODE(FPRoundInt32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(FPRoundInt64, T::U64, T::U64, T::U8, T::U1 ) -OPCODE(FPRSqrtEstimate32, T::U32, T::U32 ) -OPCODE(FPRSqrtEstimate64, T::U64, T::U64 ) -OPCODE(FPRSqrtStepFused32, T::U32, T::U32, T::U32 ) -OPCODE(FPRSqrtStepFused64, T::U64, T::U64, T::U64 ) -OPCODE(FPSqrt32, T::U32, T::U32 ) -OPCODE(FPSqrt64, T::U64, T::U64 ) -OPCODE(FPSub32, T::U32, T::U32, T::U32 ) 
-OPCODE(FPSub64, T::U64, T::U64, T::U64 ) +OPCODE(FPAbs32, T::U32, T::U32 ) +OPCODE(FPAbs64, T::U64, T::U64 ) +OPCODE(FPAdd32, T::U32, T::U32, T::U32 ) +OPCODE(FPAdd64, T::U64, T::U64, T::U64 ) +OPCODE(FPCompare32, T::NZCVFlags, T::U32, T::U32, T::U1 ) +OPCODE(FPCompare64, T::NZCVFlags, T::U64, T::U64, T::U1 ) +OPCODE(FPDiv32, T::U32, T::U32, T::U32 ) +OPCODE(FPDiv64, T::U64, T::U64, T::U64 ) +OPCODE(FPMax32, T::U32, T::U32, T::U32 ) +OPCODE(FPMax64, T::U64, T::U64, T::U64 ) +OPCODE(FPMaxNumeric32, T::U32, T::U32, T::U32 ) +OPCODE(FPMaxNumeric64, T::U64, T::U64, T::U64 ) +OPCODE(FPMin32, T::U32, T::U32, T::U32 ) +OPCODE(FPMin64, T::U64, T::U64, T::U64 ) +OPCODE(FPMinNumeric32, T::U32, T::U32, T::U32 ) +OPCODE(FPMinNumeric64, T::U64, T::U64, T::U64 ) +OPCODE(FPMul32, T::U32, T::U32, T::U32 ) +OPCODE(FPMul64, T::U64, T::U64, T::U64 ) +OPCODE(FPMulAdd32, T::U32, T::U32, T::U32, T::U32 ) +OPCODE(FPMulAdd64, T::U64, T::U64, T::U64, T::U64 ) +OPCODE(FPNeg32, T::U32, T::U32 ) +OPCODE(FPNeg64, T::U64, T::U64 ) +OPCODE(FPRoundInt32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(FPRoundInt64, T::U64, T::U64, T::U8, T::U1 ) +OPCODE(FPRSqrtEstimate32, T::U32, T::U32 ) +OPCODE(FPRSqrtEstimate64, T::U64, T::U64 ) +OPCODE(FPRSqrtStepFused32, T::U32, T::U32, T::U32 ) +OPCODE(FPRSqrtStepFused64, T::U64, T::U64, T::U64 ) +OPCODE(FPSqrt32, T::U32, T::U32 ) +OPCODE(FPSqrt64, T::U64, T::U64 ) +OPCODE(FPSub32, T::U32, T::U32, T::U32 ) +OPCODE(FPSub64, T::U64, T::U64, T::U64 ) // Floating-point conversions -OPCODE(FPSingleToDouble, T::U64, T::U32 ) -OPCODE(FPDoubleToSingle, T::U32, T::U64 ) -OPCODE(FPDoubleToFixedS32, T::U32, T::U64, T::U8, T::U8 ) -OPCODE(FPDoubleToFixedS64, T::U64, T::U64, T::U8, T::U8 ) -OPCODE(FPDoubleToFixedU32, T::U32, T::U64, T::U8, T::U8 ) -OPCODE(FPDoubleToFixedU64, T::U64, T::U64, T::U8, T::U8 ) -OPCODE(FPSingleToFixedS32, T::U32, T::U32, T::U8, T::U8 ) -OPCODE(FPSingleToFixedS64, T::U64, T::U32, T::U8, T::U8 ) -OPCODE(FPSingleToFixedU32, T::U32, T::U32, T::U8, T::U8 
) -OPCODE(FPSingleToFixedU64, T::U64, T::U32, T::U8, T::U8 ) -OPCODE(FPU32ToSingle, T::U32, T::U32, T::U1 ) -OPCODE(FPS32ToSingle, T::U32, T::U32, T::U1 ) -OPCODE(FPU32ToDouble, T::U64, T::U32, T::U1 ) -OPCODE(FPU64ToDouble, T::U64, T::U64, T::U1 ) -OPCODE(FPU64ToSingle, T::U32, T::U64, T::U1 ) -OPCODE(FPS32ToDouble, T::U64, T::U32, T::U1 ) -OPCODE(FPS64ToDouble, T::U64, T::U64, T::U1 ) -OPCODE(FPS64ToSingle, T::U32, T::U64, T::U1 ) +OPCODE(FPSingleToDouble, T::U64, T::U32 ) +OPCODE(FPDoubleToSingle, T::U32, T::U64 ) +OPCODE(FPDoubleToFixedS32, T::U32, T::U64, T::U8, T::U8 ) +OPCODE(FPDoubleToFixedS64, T::U64, T::U64, T::U8, T::U8 ) +OPCODE(FPDoubleToFixedU32, T::U32, T::U64, T::U8, T::U8 ) +OPCODE(FPDoubleToFixedU64, T::U64, T::U64, T::U8, T::U8 ) +OPCODE(FPSingleToFixedS32, T::U32, T::U32, T::U8, T::U8 ) +OPCODE(FPSingleToFixedS64, T::U64, T::U32, T::U8, T::U8 ) +OPCODE(FPSingleToFixedU32, T::U32, T::U32, T::U8, T::U8 ) +OPCODE(FPSingleToFixedU64, T::U64, T::U32, T::U8, T::U8 ) +OPCODE(FPU32ToSingle, T::U32, T::U32, T::U1 ) +OPCODE(FPS32ToSingle, T::U32, T::U32, T::U1 ) +OPCODE(FPU32ToDouble, T::U64, T::U32, T::U1 ) +OPCODE(FPU64ToDouble, T::U64, T::U64, T::U1 ) +OPCODE(FPU64ToSingle, T::U32, T::U64, T::U1 ) +OPCODE(FPS32ToDouble, T::U64, T::U32, T::U1 ) +OPCODE(FPS64ToDouble, T::U64, T::U64, T::U1 ) +OPCODE(FPS64ToSingle, T::U32, T::U64, T::U1 ) // Floating-point vector instructions -OPCODE(FPVectorAbs16, T::U128, T::U128 ) -OPCODE(FPVectorAbs32, T::U128, T::U128 ) -OPCODE(FPVectorAbs64, T::U128, T::U128 ) -OPCODE(FPVectorAdd32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorAdd64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorDiv32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorDiv64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorEqual32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorEqual64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorGreater32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorGreater64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorGreaterEqual32, T::U128, 
T::U128, T::U128 ) -OPCODE(FPVectorGreaterEqual64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorMul32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorMul64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorPairedAddLower32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorPairedAddLower64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorPairedAdd32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorPairedAdd64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorRSqrtEstimate32, T::U128, T::U128 ) -OPCODE(FPVectorRSqrtEstimate64, T::U128, T::U128 ) -OPCODE(FPVectorRSqrtStepFused32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorRSqrtStepFused64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorS32ToSingle, T::U128, T::U128 ) -OPCODE(FPVectorS64ToDouble, T::U128, T::U128 ) -OPCODE(FPVectorSub32, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorSub64, T::U128, T::U128, T::U128 ) -OPCODE(FPVectorU32ToSingle, T::U128, T::U128 ) -OPCODE(FPVectorU64ToDouble, T::U128, T::U128 ) +OPCODE(FPVectorAbs16, T::U128, T::U128 ) +OPCODE(FPVectorAbs32, T::U128, T::U128 ) +OPCODE(FPVectorAbs64, T::U128, T::U128 ) +OPCODE(FPVectorAdd32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorAdd64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorDiv32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorDiv64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorEqual32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorEqual64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorGreater32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorGreater64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorGreaterEqual32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorGreaterEqual64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorMul32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorMul64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorPairedAddLower32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorPairedAddLower64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorPairedAdd32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorPairedAdd64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorRSqrtEstimate32, T::U128, T::U128 ) 
+OPCODE(FPVectorRSqrtEstimate64, T::U128, T::U128 ) +OPCODE(FPVectorRSqrtStepFused32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorRSqrtStepFused64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorS32ToSingle, T::U128, T::U128 ) +OPCODE(FPVectorS64ToDouble, T::U128, T::U128 ) +OPCODE(FPVectorSub32, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorSub64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorU32ToSingle, T::U128, T::U128 ) +OPCODE(FPVectorU64ToDouble, T::U128, T::U128 ) // A32 Memory access -A32OPC(ClearExclusive, T::Void, ) -A32OPC(SetExclusive, T::Void, T::U32, T::U8 ) -A32OPC(ReadMemory8, T::U8, T::U32 ) -A32OPC(ReadMemory16, T::U16, T::U32 ) -A32OPC(ReadMemory32, T::U32, T::U32 ) -A32OPC(ReadMemory64, T::U64, T::U32 ) -A32OPC(WriteMemory8, T::Void, T::U32, T::U8 ) -A32OPC(WriteMemory16, T::Void, T::U32, T::U16 ) -A32OPC(WriteMemory32, T::Void, T::U32, T::U32 ) -A32OPC(WriteMemory64, T::Void, T::U32, T::U64 ) -A32OPC(ExclusiveWriteMemory8, T::U32, T::U32, T::U8 ) -A32OPC(ExclusiveWriteMemory16, T::U32, T::U32, T::U16 ) -A32OPC(ExclusiveWriteMemory32, T::U32, T::U32, T::U32 ) -A32OPC(ExclusiveWriteMemory64, T::U32, T::U32, T::U32, T::U32 ) +A32OPC(ClearExclusive, T::Void, ) +A32OPC(SetExclusive, T::Void, T::U32, T::U8 ) +A32OPC(ReadMemory8, T::U8, T::U32 ) +A32OPC(ReadMemory16, T::U16, T::U32 ) +A32OPC(ReadMemory32, T::U32, T::U32 ) +A32OPC(ReadMemory64, T::U64, T::U32 ) +A32OPC(WriteMemory8, T::Void, T::U32, T::U8 ) +A32OPC(WriteMemory16, T::Void, T::U32, T::U16 ) +A32OPC(WriteMemory32, T::Void, T::U32, T::U32 ) +A32OPC(WriteMemory64, T::Void, T::U32, T::U64 ) +A32OPC(ExclusiveWriteMemory8, T::U32, T::U32, T::U8 ) +A32OPC(ExclusiveWriteMemory16, T::U32, T::U32, T::U16 ) +A32OPC(ExclusiveWriteMemory32, T::U32, T::U32, T::U32 ) +A32OPC(ExclusiveWriteMemory64, T::U32, T::U32, T::U32, T::U32 ) // A64 Memory access -A64OPC(ClearExclusive, T::Void, ) -A64OPC(SetExclusive, T::Void, T::U64, T::U8 ) -A64OPC(ReadMemory8, T::U8, T::U64 ) -A64OPC(ReadMemory16, T::U16, T::U64 ) 
-A64OPC(ReadMemory32, T::U32, T::U64 ) -A64OPC(ReadMemory64, T::U64, T::U64 ) -A64OPC(ReadMemory128, T::U128, T::U64 ) -A64OPC(WriteMemory8, T::Void, T::U64, T::U8 ) -A64OPC(WriteMemory16, T::Void, T::U64, T::U16 ) -A64OPC(WriteMemory32, T::Void, T::U64, T::U32 ) -A64OPC(WriteMemory64, T::Void, T::U64, T::U64 ) -A64OPC(WriteMemory128, T::Void, T::U64, T::U128 ) -A64OPC(ExclusiveWriteMemory8, T::U32, T::U64, T::U8 ) -A64OPC(ExclusiveWriteMemory16, T::U32, T::U64, T::U16 ) -A64OPC(ExclusiveWriteMemory32, T::U32, T::U64, T::U32 ) -A64OPC(ExclusiveWriteMemory64, T::U32, T::U64, T::U64 ) -A64OPC(ExclusiveWriteMemory128, T::U32, T::U64, T::U128 ) +A64OPC(ClearExclusive, T::Void, ) +A64OPC(SetExclusive, T::Void, T::U64, T::U8 ) +A64OPC(ReadMemory8, T::U8, T::U64 ) +A64OPC(ReadMemory16, T::U16, T::U64 ) +A64OPC(ReadMemory32, T::U32, T::U64 ) +A64OPC(ReadMemory64, T::U64, T::U64 ) +A64OPC(ReadMemory128, T::U128, T::U64 ) +A64OPC(WriteMemory8, T::Void, T::U64, T::U8 ) +A64OPC(WriteMemory16, T::Void, T::U64, T::U16 ) +A64OPC(WriteMemory32, T::Void, T::U64, T::U32 ) +A64OPC(WriteMemory64, T::Void, T::U64, T::U64 ) +A64OPC(WriteMemory128, T::Void, T::U64, T::U128 ) +A64OPC(ExclusiveWriteMemory8, T::U32, T::U64, T::U8 ) +A64OPC(ExclusiveWriteMemory16, T::U32, T::U64, T::U16 ) +A64OPC(ExclusiveWriteMemory32, T::U32, T::U64, T::U32 ) +A64OPC(ExclusiveWriteMemory64, T::U32, T::U64, T::U64 ) +A64OPC(ExclusiveWriteMemory128, T::U32, T::U64, T::U128 ) // Coprocessor -A32OPC(CoprocInternalOperation, T::Void, T::CoprocInfo ) -A32OPC(CoprocSendOneWord, T::Void, T::CoprocInfo, T::U32 ) -A32OPC(CoprocSendTwoWords, T::Void, T::CoprocInfo, T::U32, T::U32 ) -A32OPC(CoprocGetOneWord, T::U32, T::CoprocInfo ) -A32OPC(CoprocGetTwoWords, T::U64, T::CoprocInfo ) -A32OPC(CoprocLoadWords, T::Void, T::CoprocInfo, T::U32 ) -A32OPC(CoprocStoreWords, T::Void, T::CoprocInfo, T::U32 ) +A32OPC(CoprocInternalOperation, T::Void, T::CoprocInfo ) +A32OPC(CoprocSendOneWord, T::Void, T::CoprocInfo, T::U32 ) 
+A32OPC(CoprocSendTwoWords, T::Void, T::CoprocInfo, T::U32, T::U32 ) +A32OPC(CoprocGetOneWord, T::U32, T::CoprocInfo ) +A32OPC(CoprocGetTwoWords, T::U64, T::CoprocInfo ) +A32OPC(CoprocLoadWords, T::Void, T::CoprocInfo, T::U32 ) +A32OPC(CoprocStoreWords, T::Void, T::CoprocInfo, T::U32 ) diff --git a/tests/A64/fuzz_with_unicorn.cpp b/tests/A64/fuzz_with_unicorn.cpp index d2bcdaee..df1921f5 100644 --- a/tests/A64/fuzz_with_unicorn.cpp +++ b/tests/A64/fuzz_with_unicorn.cpp @@ -11,6 +11,7 @@ #include +#include "common/fp/fpsr.h" #include "common/llvm_disassemble.h" #include "common/scope_exit.h" #include "frontend/A64/decoder/a64.h" @@ -171,6 +172,7 @@ static void RunTestInstance(const Unicorn::RegisterArray& regs, const Unicorn::V jit.SetPC(instructions_offset * 4); jit.SetSP(0x08000000); jit.SetFpcr(fpcr); + jit.SetFpsr(0); jit.SetPstate(pstate); jit.ClearCache(); uni.SetRegisters(regs); @@ -178,6 +180,7 @@ static void RunTestInstance(const Unicorn::RegisterArray& regs, const Unicorn::V uni.SetPC(instructions_offset * 4); uni.SetSP(0x08000000); uni.SetFpcr(fpcr); + uni.SetFpsr(0); uni.SetPstate(pstate); uni.ClearPageCache(); @@ -213,6 +216,7 @@ static void RunTestInstance(const Unicorn::RegisterArray& regs, const Unicorn::V fmt::print("sp : {:016x} {:016x} {}\n", uni.GetSP(), jit.GetSP(), uni.GetSP() != jit.GetSP() ? "*" : ""); fmt::print("pc : {:016x} {:016x} {}\n", uni.GetPC(), jit.GetPC(), uni.GetPC() != jit.GetPC() ? "*" : ""); fmt::print("p : {:08x} {:08x} {}\n", uni.GetPstate(), jit.GetPstate(), (uni.GetPstate() & 0xF0000000) != (jit.GetPstate() & 0xF0000000) ? "*" : ""); + fmt::print("qc : {:08x} {:08x} {}\n", uni.GetFpsr(), jit.GetFpsr(), FP::FPSR{uni.GetFpsr()}.QC() != FP::FPSR{jit.GetFpsr()}.QC() ? 
"*" : ""); fmt::print("\n"); fmt::print("Modified memory:\n"); @@ -255,6 +259,7 @@ static void RunTestInstance(const Unicorn::RegisterArray& regs, const Unicorn::V REQUIRE((uni.GetPstate() & 0xF0000000) == (jit.GetPstate() & 0xF0000000)); REQUIRE(uni_env.modified_memory == jit_env.modified_memory); REQUIRE(uni_env.interrupts.empty()); + REQUIRE(FP::FPSR{uni.GetFpsr()}.QC() == FP::FPSR{jit.GetFpsr()}.QC()); } TEST_CASE("A64: Single random instruction", "[a64]") { diff --git a/tests/A64/unicorn_emu/unicorn.cpp b/tests/A64/unicorn_emu/unicorn.cpp index 4eb2a6ac..c0431f6a 100644 --- a/tests/A64/unicorn_emu/unicorn.cpp +++ b/tests/A64/unicorn_emu/unicorn.cpp @@ -128,6 +128,16 @@ void Unicorn::SetFpcr(u32 value) { CHECKED(uc_reg_write(uc, UC_ARM64_REG_FPCR, &value)); } +u32 Unicorn::GetFpsr() const { + u32 fpsr; + CHECKED(uc_reg_read(uc, UC_ARM64_REG_FPSR, &fpsr)); + return fpsr; +} + +void Unicorn::SetFpsr(u32 value) { + CHECKED(uc_reg_write(uc, UC_ARM64_REG_FPSR, &value)); +} + u32 Unicorn::GetPstate() const { u32 pstate; CHECKED(uc_reg_read(uc, UC_ARM64_REG_NZCV, &pstate)); diff --git a/tests/A64/unicorn_emu/unicorn.h b/tests/A64/unicorn_emu/unicorn.h index a39996e1..50b37842 100644 --- a/tests/A64/unicorn_emu/unicorn.h +++ b/tests/A64/unicorn_emu/unicorn.h @@ -46,6 +46,9 @@ public: u32 GetFpcr() const; void SetFpcr(u32 value); + u32 GetFpsr() const; + void SetFpsr(u32 value); + u32 GetPstate() const; void SetPstate(u32 value);