From f6247125c097729bf66fd1b369e281e2c626baf5 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 10 Feb 2018 09:31:50 +0000 Subject: [PATCH] IR: Implement VectorLogicalShiftLeft{8,16,32,64} --- src/backend_x64/emit_x64_vector.cpp | 47 +++ src/frontend/ir/ir_emitter.cpp | 16 + src/frontend/ir/ir_emitter.h | 4 + src/frontend/ir/opcodes.inc | 538 ++++++++++++++-------------- 4 files changed, 338 insertions(+), 267 deletions(-) diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp index ac75a46b..ddc72b83 100644 --- a/src/backend_x64/emit_x64_vector.cpp +++ b/src/backend_x64/emit_x64_vector.cpp @@ -587,6 +587,53 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, a); } +void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + // TODO: Optimize + for (size_t i = 0; i < shift_amount; ++i) { + code.paddb(result, result); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psllw(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.pslld(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psllq(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index b3151575..845561ed 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -873,6 +873,22 @@ U128 IREmitter::VectorInterleaveLower64(const U128& a, const U128& b) { return Inst(Opcode::VectorInterleaveLower64, a, b); } +U128 IREmitter::VectorLogicalShiftLeft8(const U128& a, u8 shift_amount) { + return Inst(Opcode::VectorLogicalShiftLeft8, a, Imm8(shift_amount)); +} + +U128 IREmitter::VectorLogicalShiftLeft16(const U128& a, u8 shift_amount) { + return Inst(Opcode::VectorLogicalShiftLeft16, a, Imm8(shift_amount)); +} + +U128 IREmitter::VectorLogicalShiftLeft32(const U128& a, u8 shift_amount) { + return Inst(Opcode::VectorLogicalShiftLeft32, a, Imm8(shift_amount)); +} + +U128 IREmitter::VectorLogicalShiftLeft64(const U128& a, u8 shift_amount) { + return Inst(Opcode::VectorLogicalShiftLeft64, a, Imm8(shift_amount)); +} + U128 IREmitter::VectorNot(const U128& a) { return Inst(Opcode::VectorNot, a); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index c13d65e6..975c6eae 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -229,6 +229,10 @@ public: U128 VectorInterleaveLower16(const U128& a, const U128& b); U128 VectorInterleaveLower32(const U128& a, const U128& b); U128 VectorInterleaveLower64(const U128& a, const U128& b); + U128 VectorLogicalShiftLeft8(const U128& a, u8 shift_amount); + U128 VectorLogicalShiftLeft16(const U128& a, u8 shift_amount); + U128 VectorLogicalShiftLeft32(const U128& a, u8 shift_amount); + U128 VectorLogicalShiftLeft64(const U128& a, u8 shift_amount); U128 VectorNot(const U128& a); U128 VectorOr(const U128& a, const U128& b); U128 VectorPairedAdd8(const U128& a, const U128& b); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 9e03e4ee..aaec2ab9 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -1,299 +1,303 @@ -// opcode name, return type, arg1 type, arg2 type, arg3 type, ... +// opcode name, return type, arg1 type, arg2 type, arg3 type, ... -OPCODE(Void, T::Void, ) -OPCODE(Identity, T::Opaque, T::Opaque ) -OPCODE(Breakpoint, T::Void, ) +OPCODE(Void, T::Void, ) +OPCODE(Identity, T::Opaque, T::Opaque ) +OPCODE(Breakpoint, T::Void, ) // A32 Context getters/setters -A32OPC(GetRegister, T::U32, T::A32Reg ) -A32OPC(GetExtendedRegister32, T::U32, T::A32ExtReg ) -A32OPC(GetExtendedRegister64, T::U64, T::A32ExtReg ) -A32OPC(SetRegister, T::Void, T::A32Reg, T::U32 ) -A32OPC(SetExtendedRegister32, T::Void, T::A32ExtReg, T::U32 ) -A32OPC(SetExtendedRegister64, T::Void, T::A32ExtReg, T::U64 ) -A32OPC(GetCpsr, T::U32, ) -A32OPC(SetCpsr, T::Void, T::U32 ) -A32OPC(SetCpsrNZCV, T::Void, T::U32 ) -A32OPC(SetCpsrNZCVQ, T::Void, T::U32 ) -A32OPC(GetNFlag, T::U1, ) -A32OPC(SetNFlag, T::Void, T::U1 ) -A32OPC(GetZFlag, T::U1, ) -A32OPC(SetZFlag, T::Void, T::U1 ) -A32OPC(GetCFlag, T::U1, ) -A32OPC(SetCFlag, T::Void, T::U1 ) -A32OPC(GetVFlag, T::U1, ) -A32OPC(SetVFlag, T::Void, T::U1 ) -A32OPC(OrQFlag, T::Void, T::U1 ) -A32OPC(GetGEFlags, T::U32, ) -A32OPC(SetGEFlags, T::Void, T::U32 ) -A32OPC(SetGEFlagsCompressed, T::Void, T::U32 ) -A32OPC(BXWritePC, T::Void, T::U32 ) -A32OPC(CallSupervisor, T::Void, T::U32 ) -A32OPC(ExceptionRaised, T::Void, T::U32, T::U64 ) -A32OPC(GetFpscr, T::U32, ) -A32OPC(SetFpscr, T::Void, T::U32, ) -A32OPC(GetFpscrNZCV, T::U32, ) -A32OPC(SetFpscrNZCV, T::Void, T::NZCVFlags ) +A32OPC(GetRegister, T::U32, T::A32Reg ) +A32OPC(GetExtendedRegister32, T::U32, T::A32ExtReg ) +A32OPC(GetExtendedRegister64, T::U64, T::A32ExtReg ) +A32OPC(SetRegister, T::Void, T::A32Reg, T::U32 ) +A32OPC(SetExtendedRegister32, T::Void, T::A32ExtReg, T::U32 ) +A32OPC(SetExtendedRegister64, T::Void, T::A32ExtReg, T::U64 ) +A32OPC(GetCpsr, T::U32, ) +A32OPC(SetCpsr, T::Void, T::U32 ) +A32OPC(SetCpsrNZCV, T::Void, T::U32 ) +A32OPC(SetCpsrNZCVQ, T::Void, T::U32 ) +A32OPC(GetNFlag, T::U1, ) +A32OPC(SetNFlag, T::Void, T::U1 ) +A32OPC(GetZFlag, T::U1, ) +A32OPC(SetZFlag, T::Void, T::U1 ) +A32OPC(GetCFlag, T::U1, ) +A32OPC(SetCFlag, T::Void, T::U1 ) +A32OPC(GetVFlag, T::U1, ) +A32OPC(SetVFlag, T::Void, T::U1 ) +A32OPC(OrQFlag, T::Void, T::U1 ) +A32OPC(GetGEFlags, T::U32, ) +A32OPC(SetGEFlags, T::Void, T::U32 ) +A32OPC(SetGEFlagsCompressed, T::Void, T::U32 ) +A32OPC(BXWritePC, T::Void, T::U32 ) +A32OPC(CallSupervisor, T::Void, T::U32 ) +A32OPC(ExceptionRaised, T::Void, T::U32, T::U64 ) +A32OPC(GetFpscr, T::U32, ) +A32OPC(SetFpscr, T::Void, T::U32, ) +A32OPC(GetFpscrNZCV, T::U32, ) +A32OPC(SetFpscrNZCV, T::Void, T::NZCVFlags ) // A64 Context getters/setters -A64OPC(SetCheckBit, T::Void, T::U1 ) -A64OPC(GetCFlag, T::U1, ) -A64OPC(SetNZCV, T::Void, T::NZCVFlags ) -A64OPC(GetW, T::U32, T::A64Reg ) -A64OPC(GetX, T::U64, T::A64Reg ) -//A64OPC(GetB, T::U128, T::A64Vec ) -//A64OPC(GetH, T::U128, T::A64Vec ) -A64OPC(GetS, T::U128, T::A64Vec ) -A64OPC(GetD, T::U128, T::A64Vec ) -A64OPC(GetQ, T::U128, T::A64Vec ) -A64OPC(GetSP, T::U64, ) -A64OPC(SetW, T::Void, T::A64Reg, T::U32 ) -A64OPC(SetX, T::Void, T::A64Reg, T::U64 ) -//A64OPC(SetB, T::Void, T::A64Vec, T::U8 ) -//A64OPC(SetH, T::Void, T::A64Vec, T::U16 ) -A64OPC(SetS, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetD, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetQ, T::Void, T::A64Vec, T::U128 ) -A64OPC(SetSP, T::Void, T::U64 ) -A64OPC(SetPC, T::Void, T::U64 ) -A64OPC(CallSupervisor, T::Void, T::U32 ) -A64OPC(ExceptionRaised, T::Void, T::U64, T::U64 ) +A64OPC(SetCheckBit, T::Void, T::U1 ) +A64OPC(GetCFlag, T::U1, ) +A64OPC(SetNZCV, T::Void, T::NZCVFlags ) +A64OPC(GetW, T::U32, T::A64Reg ) +A64OPC(GetX, T::U64, T::A64Reg ) +//A64OPC(GetB, T::U128, T::A64Vec ) +//A64OPC(GetH, T::U128, T::A64Vec ) +A64OPC(GetS, T::U128, T::A64Vec ) +A64OPC(GetD, T::U128, T::A64Vec ) +A64OPC(GetQ, T::U128, T::A64Vec ) +A64OPC(GetSP, T::U64, ) +A64OPC(SetW, T::Void, T::A64Reg, T::U32 ) +A64OPC(SetX, T::Void, T::A64Reg, T::U64 ) +//A64OPC(SetB, T::Void, T::A64Vec, T::U8 ) +//A64OPC(SetH, T::Void, T::A64Vec, T::U16 ) +A64OPC(SetS, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetD, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetQ, T::Void, T::A64Vec, T::U128 ) +A64OPC(SetSP, T::Void, T::U64 ) +A64OPC(SetPC, T::Void, T::U64 ) +A64OPC(CallSupervisor, T::Void, T::U32 ) +A64OPC(ExceptionRaised, T::Void, T::U64, T::U64 ) // Hints -OPCODE(PushRSB, T::Void, T::U64 ) +OPCODE(PushRSB, T::Void, T::U64 ) // Pseudo-operation, handled specially at final emit -OPCODE(GetCarryFromOp, T::U1, T::U32 ) -OPCODE(GetOverflowFromOp, T::U1, T::U32 ) -OPCODE(GetGEFromOp, T::U32, T::U32 ) -OPCODE(GetNZCVFromOp, T::NZCVFlags, T::Opaque ) +OPCODE(GetCarryFromOp, T::U1, T::U32 ) +OPCODE(GetOverflowFromOp, T::U1, T::U32 ) +OPCODE(GetGEFromOp, T::U32, T::U32 ) +OPCODE(GetNZCVFromOp, T::NZCVFlags, T::Opaque ) -OPCODE(NZCVFromPackedFlags, T::NZCVFlags, T::U32 ) +OPCODE(NZCVFromPackedFlags, T::NZCVFlags, T::U32 ) // Calculations -OPCODE(Pack2x32To1x64, T::U64, T::U32, T::U32 ) -OPCODE(LeastSignificantWord, T::U32, T::U64 ) -OPCODE(MostSignificantWord, T::U32, T::U64 ) -OPCODE(LeastSignificantHalf, T::U16, T::U32 ) -OPCODE(LeastSignificantByte, T::U8, T::U32 ) -OPCODE(MostSignificantBit, T::U1, T::U32 ) -OPCODE(IsZero32, T::U1, T::U32 ) -OPCODE(IsZero64, T::U1, T::U64 ) -OPCODE(TestBit, T::U1, T::U64, T::U8 ) -OPCODE(ConditionalSelect32, T::U32, T::Cond, T::U32, T::U32 ) -OPCODE(ConditionalSelect64, T::U64, T::Cond, T::U64, T::U64 ) -OPCODE(ConditionalSelectNZCV, T::NZCVFlags, T::Cond, T::NZCVFlags, T::NZCVFlags ) -OPCODE(LogicalShiftLeft32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(LogicalShiftLeft64, T::U64, T::U64, T::U8 ) -OPCODE(LogicalShiftRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(LogicalShiftRight64, T::U64, T::U64, T::U8 ) -OPCODE(ArithmeticShiftRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(ArithmeticShiftRight64, T::U64, T::U64, T::U8 ) -OPCODE(RotateRight32, T::U32, T::U32, T::U8, T::U1 ) -OPCODE(RotateRight64, T::U64, T::U64, T::U8 ) -OPCODE(RotateRightExtended, T::U32, T::U32, T::U1 ) -OPCODE(Add32, T::U32, T::U32, T::U32, T::U1 ) -OPCODE(Add64, T::U64, T::U64, T::U64, T::U1 ) -OPCODE(Sub32, T::U32, T::U32, T::U32, T::U1 ) -OPCODE(Sub64, T::U64, T::U64, T::U64, T::U1 ) -OPCODE(Mul32, T::U32, T::U32, T::U32 ) -OPCODE(Mul64, T::U64, T::U64, T::U64 ) -OPCODE(SignedMultiplyHigh64, T::U64, T::U64, T::U64 ) -OPCODE(UnsignedMultiplyHigh64, T::U64, T::U64, T::U64 ) -OPCODE(UnsignedDiv32, T::U32, T::U32, T::U32 ) -OPCODE(UnsignedDiv64, T::U64, T::U64, T::U64 ) -OPCODE(SignedDiv32, T::U32, T::U32, T::U32 ) -OPCODE(SignedDiv64, T::U64, T::U64, T::U64 ) -OPCODE(And32, T::U32, T::U32, T::U32 ) -OPCODE(And64, T::U64, T::U64, T::U64 ) -OPCODE(Eor32, T::U32, T::U32, T::U32 ) -OPCODE(Eor64, T::U64, T::U64, T::U64 ) -OPCODE(Or32, T::U32, T::U32, T::U32 ) -OPCODE(Or64, T::U64, T::U64, T::U64 ) -OPCODE(Not32, T::U32, T::U32 ) -OPCODE(Not64, T::U64, T::U64 ) -OPCODE(SignExtendByteToWord, T::U32, T::U8 ) -OPCODE(SignExtendHalfToWord, T::U32, T::U16 ) -OPCODE(SignExtendByteToLong, T::U64, T::U8 ) -OPCODE(SignExtendHalfToLong, T::U64, T::U16 ) -OPCODE(SignExtendWordToLong, T::U64, T::U32 ) -OPCODE(ZeroExtendByteToWord, T::U32, T::U8 ) -OPCODE(ZeroExtendHalfToWord, T::U32, T::U16 ) -OPCODE(ZeroExtendByteToLong, T::U64, T::U8 ) -OPCODE(ZeroExtendHalfToLong, T::U64, T::U16 ) -OPCODE(ZeroExtendWordToLong, T::U64, T::U32 ) -OPCODE(ZeroExtendLongToQuad, T::U128, T::U64 ) -OPCODE(ByteReverseWord, T::U32, T::U32 ) -OPCODE(ByteReverseHalf, T::U16, T::U16 ) -OPCODE(ByteReverseDual, T::U64, T::U64 ) -OPCODE(CountLeadingZeros32, T::U32, T::U32 ) -OPCODE(CountLeadingZeros64, T::U64, T::U64 ) -OPCODE(ExtractRegister32, T::U32, T::U32, T::U32, T::U8 ) -OPCODE(ExtractRegister64, T::U64, T::U64, T::U64, T::U8 ) +OPCODE(Pack2x32To1x64, T::U64, T::U32, T::U32 ) +OPCODE(LeastSignificantWord, T::U32, T::U64 ) +OPCODE(MostSignificantWord, T::U32, T::U64 ) +OPCODE(LeastSignificantHalf, T::U16, T::U32 ) +OPCODE(LeastSignificantByte, T::U8, T::U32 ) +OPCODE(MostSignificantBit, T::U1, T::U32 ) +OPCODE(IsZero32, T::U1, T::U32 ) +OPCODE(IsZero64, T::U1, T::U64 ) +OPCODE(TestBit, T::U1, T::U64, T::U8 ) +OPCODE(ConditionalSelect32, T::U32, T::Cond, T::U32, T::U32 ) +OPCODE(ConditionalSelect64, T::U64, T::Cond, T::U64, T::U64 ) +OPCODE(ConditionalSelectNZCV, T::NZCVFlags, T::Cond, T::NZCVFlags, T::NZCVFlags ) +OPCODE(LogicalShiftLeft32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(LogicalShiftLeft64, T::U64, T::U64, T::U8 ) +OPCODE(LogicalShiftRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(LogicalShiftRight64, T::U64, T::U64, T::U8 ) +OPCODE(ArithmeticShiftRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(ArithmeticShiftRight64, T::U64, T::U64, T::U8 ) +OPCODE(RotateRight32, T::U32, T::U32, T::U8, T::U1 ) +OPCODE(RotateRight64, T::U64, T::U64, T::U8 ) +OPCODE(RotateRightExtended, T::U32, T::U32, T::U1 ) +OPCODE(Add32, T::U32, T::U32, T::U32, T::U1 ) +OPCODE(Add64, T::U64, T::U64, T::U64, T::U1 ) +OPCODE(Sub32, T::U32, T::U32, T::U32, T::U1 ) +OPCODE(Sub64, T::U64, T::U64, T::U64, T::U1 ) +OPCODE(Mul32, T::U32, T::U32, T::U32 ) +OPCODE(Mul64, T::U64, T::U64, T::U64 ) +OPCODE(SignedMultiplyHigh64, T::U64, T::U64, T::U64 ) +OPCODE(UnsignedMultiplyHigh64, T::U64, T::U64, T::U64 ) +OPCODE(UnsignedDiv32, T::U32, T::U32, T::U32 ) +OPCODE(UnsignedDiv64, T::U64, T::U64, T::U64 ) +OPCODE(SignedDiv32, T::U32, T::U32, T::U32 ) +OPCODE(SignedDiv64, T::U64, T::U64, T::U64 ) +OPCODE(And32, T::U32, T::U32, T::U32 ) +OPCODE(And64, T::U64, T::U64, T::U64 ) +OPCODE(Eor32, T::U32, T::U32, T::U32 ) +OPCODE(Eor64, T::U64, T::U64, T::U64 ) +OPCODE(Or32, T::U32, T::U32, T::U32 ) +OPCODE(Or64, T::U64, T::U64, T::U64 ) +OPCODE(Not32, T::U32, T::U32 ) +OPCODE(Not64, T::U64, T::U64 ) +OPCODE(SignExtendByteToWord, T::U32, T::U8 ) +OPCODE(SignExtendHalfToWord, T::U32, T::U16 ) +OPCODE(SignExtendByteToLong, T::U64, T::U8 ) +OPCODE(SignExtendHalfToLong, T::U64, T::U16 ) +OPCODE(SignExtendWordToLong, T::U64, T::U32 ) +OPCODE(ZeroExtendByteToWord, T::U32, T::U8 ) +OPCODE(ZeroExtendHalfToWord, T::U32, T::U16 ) +OPCODE(ZeroExtendByteToLong, T::U64, T::U8 ) +OPCODE(ZeroExtendHalfToLong, T::U64, T::U16 ) +OPCODE(ZeroExtendWordToLong, T::U64, T::U32 ) +OPCODE(ZeroExtendLongToQuad, T::U128, T::U64 ) +OPCODE(ByteReverseWord, T::U32, T::U32 ) +OPCODE(ByteReverseHalf, T::U16, T::U16 ) +OPCODE(ByteReverseDual, T::U64, T::U64 ) +OPCODE(CountLeadingZeros32, T::U32, T::U32 ) +OPCODE(CountLeadingZeros64, T::U64, T::U64 ) +OPCODE(ExtractRegister32, T::U32, T::U32, T::U32, T::U8 ) +OPCODE(ExtractRegister64, T::U64, T::U64, T::U64, T::U8 ) // Saturated instructions -OPCODE(SignedSaturatedAdd, T::U32, T::U32, T::U32 ) -OPCODE(SignedSaturatedSub, T::U32, T::U32, T::U32 ) -OPCODE(UnsignedSaturation, T::U32, T::U32, T::U8 ) -OPCODE(SignedSaturation, T::U32, T::U32, T::U8 ) +OPCODE(SignedSaturatedAdd, T::U32, T::U32, T::U32 ) +OPCODE(SignedSaturatedSub, T::U32, T::U32, T::U32 ) +OPCODE(UnsignedSaturation, T::U32, T::U32, T::U8 ) +OPCODE(SignedSaturation, T::U32, T::U32, T::U8 ) // Packed instructions -OPCODE(PackedAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 ) -OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 ) -OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedAddS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSaturatedSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAbsDiffSumS8, T::U32, T::U32, T::U32 ) +OPCODE(PackedSelect, T::U32, T::U32, T::U32, T::U32 ) // CRC instructions -OPCODE(CRC32Castagnoli8, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli16, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli32, T::U32, T::U32, T::U32 ) -OPCODE(CRC32Castagnoli64, T::U32, T::U32, T::U64 ) -OPCODE(CRC32ISO8, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO16, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO32, T::U32, T::U32, T::U32 ) -OPCODE(CRC32ISO64, T::U32, T::U32, T::U64 ) +OPCODE(CRC32Castagnoli8, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli16, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli32, T::U32, T::U32, T::U32 ) +OPCODE(CRC32Castagnoli64, T::U32, T::U32, T::U64 ) +OPCODE(CRC32ISO8, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO16, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO32, T::U32, T::U32, T::U32 ) +OPCODE(CRC32ISO64, T::U32, T::U32, T::U64 ) // AES instructions -OPCODE(AESDecryptSingleRound, T::U128, T::U128 ) -OPCODE(AESEncryptSingleRound, T::U128, T::U128 ) -OPCODE(AESInverseMixColumns, T::U128, T::U128 ) -OPCODE(AESMixColumns, T::U128, T::U128 ) +OPCODE(AESDecryptSingleRound, T::U128, T::U128 ) +OPCODE(AESEncryptSingleRound, T::U128, T::U128 ) +OPCODE(AESInverseMixColumns, T::U128, T::U128 ) +OPCODE(AESMixColumns, T::U128, T::U128 ) // Vector instructions -OPCODE(VectorGetElement8, T::U8, T::U128, T::U8 ) -OPCODE(VectorGetElement16, T::U16, T::U128, T::U8 ) -OPCODE(VectorGetElement32, T::U32, T::U128, T::U8 ) -OPCODE(VectorGetElement64, T::U64, T::U128, T::U8 ) -OPCODE(VectorSetElement8, T::U128, T::U128, T::U8, T::U8 ) -OPCODE(VectorSetElement16, T::U128, T::U128, T::U8, T::U16 ) -OPCODE(VectorSetElement32, T::U128, T::U128, T::U8, T::U32 ) -OPCODE(VectorSetElement64, T::U128, T::U128, T::U8, T::U64 ) -OPCODE(VectorAdd8, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd16, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd32, T::U128, T::U128, T::U128 ) -OPCODE(VectorAdd64, T::U128, T::U128, T::U128 ) -OPCODE(VectorAnd, T::U128, T::U128, T::U128 ) -OPCODE(VectorBroadcastLower8, T::U128, T::U8 ) -OPCODE(VectorBroadcastLower16, T::U128, T::U16 ) -OPCODE(VectorBroadcastLower32, T::U128, T::U32 ) -OPCODE(VectorBroadcast8, T::U128, T::U8 ) -OPCODE(VectorBroadcast16, T::U128, T::U16 ) -OPCODE(VectorBroadcast32, T::U128, T::U32 ) -OPCODE(VectorBroadcast64, T::U128, T::U64 ) -OPCODE(VectorEor, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual8, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual16, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual32, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual64, T::U128, T::U128, T::U128 ) -OPCODE(VectorEqual128, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower8, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower16, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower32, T::U128, T::U128, T::U128 ) -OPCODE(VectorInterleaveLower64, T::U128, T::U128, T::U128 ) -OPCODE(VectorNot, T::U128, T::U128 ) -OPCODE(VectorOr, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower8, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower16, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAddLower32, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd8, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 ) -OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 ) -OPCODE(VectorZeroUpper, T::U128, T::U128 ) +OPCODE(VectorGetElement8, T::U8, T::U128, T::U8 ) +OPCODE(VectorGetElement16, T::U16, T::U128, T::U8 ) +OPCODE(VectorGetElement32, T::U32, T::U128, T::U8 ) +OPCODE(VectorGetElement64, T::U64, T::U128, T::U8 ) +OPCODE(VectorSetElement8, T::U128, T::U128, T::U8, T::U8 ) +OPCODE(VectorSetElement16, T::U128, T::U128, T::U8, T::U16 ) +OPCODE(VectorSetElement32, T::U128, T::U128, T::U8, T::U32 ) +OPCODE(VectorSetElement64, T::U128, T::U128, T::U8, T::U64 ) +OPCODE(VectorAdd8, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd16, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd32, T::U128, T::U128, T::U128 ) +OPCODE(VectorAdd64, T::U128, T::U128, T::U128 ) +OPCODE(VectorAnd, T::U128, T::U128, T::U128 ) +OPCODE(VectorBroadcastLower8, T::U128, T::U8 ) +OPCODE(VectorBroadcastLower16, T::U128, T::U16 ) +OPCODE(VectorBroadcastLower32, T::U128, T::U32 ) +OPCODE(VectorBroadcast8, T::U128, T::U8 ) +OPCODE(VectorBroadcast16, T::U128, T::U16 ) +OPCODE(VectorBroadcast32, T::U128, T::U32 ) +OPCODE(VectorBroadcast64, T::U128, T::U64 ) +OPCODE(VectorEor, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual8, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual16, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual32, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual64, T::U128, T::U128, T::U128 ) +OPCODE(VectorEqual128, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower8, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower16, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower32, T::U128, T::U128, T::U128 ) +OPCODE(VectorInterleaveLower64, T::U128, T::U128, T::U128 ) +OPCODE(VectorLogicalShiftLeft8, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft16, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft32, T::U128, T::U128, T::U8 ) +OPCODE(VectorLogicalShiftLeft64, T::U128, T::U128, T::U8 ) +OPCODE(VectorNot, T::U128, T::U128 ) +OPCODE(VectorOr, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddLower8, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddLower16, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAddLower32, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd8, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd16, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 ) +OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 ) +OPCODE(VectorZeroUpper, T::U128, T::U128 ) // Floating-point operations -OPCODE(FPAbs32, T::U32, T::U32 ) -OPCODE(FPAbs64, T::U64, T::U64 ) -OPCODE(FPAdd32, T::U32, T::U32, T::U32 ) -OPCODE(FPAdd64, T::U64, T::U64, T::U64 ) -OPCODE(FPCompare32, T::NZCVFlags, T::U32, T::U32, T::U1 ) -OPCODE(FPCompare64, T::NZCVFlags, T::U64, T::U64, T::U1 ) -OPCODE(FPDiv32, T::U32, T::U32, T::U32 ) -OPCODE(FPDiv64, T::U64, T::U64, T::U64 ) -OPCODE(FPMul32, T::U32, T::U32, T::U32 ) -OPCODE(FPMul64, T::U64, T::U64, T::U64 ) -OPCODE(FPNeg32, T::U32, T::U32 ) -OPCODE(FPNeg64, T::U64, T::U64 ) -OPCODE(FPSqrt32, T::U32, T::U32 ) -OPCODE(FPSqrt64, T::U64, T::U64 ) -OPCODE(FPSub32, T::U32, T::U32, T::U32 ) -OPCODE(FPSub64, T::U64, T::U64, T::U64 ) +OPCODE(FPAbs32, T::U32, T::U32 ) +OPCODE(FPAbs64, T::U64, T::U64 ) +OPCODE(FPAdd32, T::U32, T::U32, T::U32 ) +OPCODE(FPAdd64, T::U64, T::U64, T::U64 ) +OPCODE(FPCompare32, T::NZCVFlags, T::U32, T::U32, T::U1 ) +OPCODE(FPCompare64, T::NZCVFlags, T::U64, T::U64, T::U1 ) +OPCODE(FPDiv32, T::U32, T::U32, T::U32 ) +OPCODE(FPDiv64, T::U64, T::U64, T::U64 ) +OPCODE(FPMul32, T::U32, T::U32, T::U32 ) +OPCODE(FPMul64, T::U64, T::U64, T::U64 ) +OPCODE(FPNeg32, T::U32, T::U32 ) +OPCODE(FPNeg64, T::U64, T::U64 ) +OPCODE(FPSqrt32, T::U32, T::U32 ) +OPCODE(FPSqrt64, T::U64, T::U64 ) +OPCODE(FPSub32, T::U32, T::U32, T::U32 ) +OPCODE(FPSub64, T::U64, T::U64, T::U64 ) // Floating-point conversions -OPCODE(FPSingleToDouble, T::U64, T::U32 ) -OPCODE(FPDoubleToSingle, T::U32, T::U64 ) -OPCODE(FPSingleToU32, T::U32, T::U32, T::U1 ) -OPCODE(FPSingleToS32, T::U32, T::U32, T::U1 ) -OPCODE(FPDoubleToU32, T::U32, T::U64, T::U1 ) -OPCODE(FPDoubleToS32, T::U32, T::U64, T::U1 ) -OPCODE(FPU32ToSingle, T::U32, T::U32, T::U1 ) -OPCODE(FPS32ToSingle, T::U32, T::U32, T::U1 ) -OPCODE(FPU32ToDouble, T::U64, T::U32, T::U1 ) -OPCODE(FPS32ToDouble, T::U64, T::U32, T::U1 ) +OPCODE(FPSingleToDouble, T::U64, T::U32 ) +OPCODE(FPDoubleToSingle, T::U32, T::U64 ) +OPCODE(FPSingleToU32, T::U32, T::U32, T::U1 ) +OPCODE(FPSingleToS32, T::U32, T::U32, T::U1 ) +OPCODE(FPDoubleToU32, T::U32, T::U64, T::U1 ) +OPCODE(FPDoubleToS32, T::U32, T::U64, T::U1 ) +OPCODE(FPU32ToSingle, T::U32, T::U32, T::U1 ) +OPCODE(FPS32ToSingle, T::U32, T::U32, T::U1 ) +OPCODE(FPU32ToDouble, T::U64, T::U32, T::U1 ) +OPCODE(FPS32ToDouble, T::U64, T::U32, T::U1 ) // A32 Memory access -A32OPC(ClearExclusive, T::Void, ) -A32OPC(SetExclusive, T::Void, T::U32, T::U8 ) -A32OPC(ReadMemory8, T::U8, T::U32 ) -A32OPC(ReadMemory16, T::U16, T::U32 ) -A32OPC(ReadMemory32, T::U32, T::U32 ) -A32OPC(ReadMemory64, T::U64, T::U32 ) -A32OPC(WriteMemory8, T::Void, T::U32, T::U8 ) -A32OPC(WriteMemory16, T::Void, T::U32, T::U16 ) -A32OPC(WriteMemory32, T::Void, T::U32, T::U32 ) -A32OPC(WriteMemory64, T::Void, T::U32, T::U64 ) -A32OPC(ExclusiveWriteMemory8, T::U32, T::U32, T::U8 ) -A32OPC(ExclusiveWriteMemory16, T::U32, T::U32, T::U16 ) -A32OPC(ExclusiveWriteMemory32, T::U32, T::U32, T::U32 ) -A32OPC(ExclusiveWriteMemory64, T::U32, T::U32, T::U32, T::U32 ) +A32OPC(ClearExclusive, T::Void, ) +A32OPC(SetExclusive, T::Void, T::U32, T::U8 ) +A32OPC(ReadMemory8, T::U8, T::U32 ) +A32OPC(ReadMemory16, T::U16, T::U32 ) +A32OPC(ReadMemory32, T::U32, T::U32 ) +A32OPC(ReadMemory64, T::U64, T::U32 ) +A32OPC(WriteMemory8, T::Void, T::U32, T::U8 ) +A32OPC(WriteMemory16, T::Void, T::U32, T::U16 ) +A32OPC(WriteMemory32, T::Void, T::U32, T::U32 ) +A32OPC(WriteMemory64, T::Void, T::U32, T::U64 ) +A32OPC(ExclusiveWriteMemory8, T::U32, T::U32, T::U8 ) +A32OPC(ExclusiveWriteMemory16, T::U32, T::U32, T::U16 ) +A32OPC(ExclusiveWriteMemory32, T::U32, T::U32, T::U32 ) +A32OPC(ExclusiveWriteMemory64, T::U32, T::U32, T::U32, T::U32 ) // A64 Memory access -A64OPC(ReadMemory8, T::U8, T::U64 ) -A64OPC(ReadMemory16, T::U16, T::U64 ) -A64OPC(ReadMemory32, T::U32, T::U64 ) -A64OPC(ReadMemory64, T::U64, T::U64 ) -A64OPC(ReadMemory128, T::U128, T::U64 ) -A64OPC(WriteMemory8, T::Void, T::U64, T::U8 ) -A64OPC(WriteMemory16, T::Void, T::U64, T::U16 ) -A64OPC(WriteMemory32, T::Void, T::U64, T::U32 ) -A64OPC(WriteMemory64, T::Void, T::U64, T::U64 ) -A64OPC(WriteMemory128, T::Void, T::U64, T::U128 ) +A64OPC(ReadMemory8, T::U8, T::U64 ) +A64OPC(ReadMemory16, T::U16, T::U64 ) +A64OPC(ReadMemory32, T::U32, T::U64 ) +A64OPC(ReadMemory64, T::U64, T::U64 ) +A64OPC(ReadMemory128, T::U128, T::U64 ) +A64OPC(WriteMemory8, T::Void, T::U64, T::U8 ) +A64OPC(WriteMemory16, T::Void, T::U64, T::U16 ) +A64OPC(WriteMemory32, T::Void, T::U64, T::U32 ) +A64OPC(WriteMemory64, T::Void, T::U64, T::U64 ) +A64OPC(WriteMemory128, T::Void, T::U64, T::U128 ) // Coprocessor -A32OPC(CoprocInternalOperation, T::Void, T::CoprocInfo ) -A32OPC(CoprocSendOneWord, T::Void, T::CoprocInfo, T::U32 ) -A32OPC(CoprocSendTwoWords, T::Void, T::CoprocInfo, T::U32, T::U32 ) -A32OPC(CoprocGetOneWord, T::U32, T::CoprocInfo ) -A32OPC(CoprocGetTwoWords, T::U64, T::CoprocInfo ) -A32OPC(CoprocLoadWords, T::Void, T::CoprocInfo, T::U32 ) -A32OPC(CoprocStoreWords, T::Void, T::CoprocInfo, T::U32 ) +A32OPC(CoprocInternalOperation, T::Void, T::CoprocInfo ) +A32OPC(CoprocSendOneWord, T::Void, T::CoprocInfo, T::U32 ) +A32OPC(CoprocSendTwoWords, T::Void, T::CoprocInfo, T::U32, T::U32 ) +A32OPC(CoprocGetOneWord, T::U32, T::CoprocInfo ) +A32OPC(CoprocGetTwoWords, T::U64, T::CoprocInfo ) +A32OPC(CoprocLoadWords, T::Void, T::CoprocInfo, T::U32 ) +A32OPC(CoprocStoreWords, T::Void, T::CoprocInfo, T::U32 )