From 9fb82036ca94cd726a2f73db8d68c6e306c216fe Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 22 Apr 2020 20:45:52 +0100 Subject: [PATCH] Squashed 'externals/xbyak/' changes from d512551e..2794cde7 2794cde7 add xword, yword, etc. in Xbyak::util fb9c04e4 fix document for vfpclassps a51be78b fix test dependency 04fdfb1e update version e6354f8b add vgf2p8mulb 09a12642 add gf2p8affineqb d171ba0e add gf2p8affineinvqb 457f4fd0 add vpshufbitqmb 5af0ba39 add vpexpand{b,w} e450f965 vpopcnt{d,q} supports ptr_b 48499eb1 add vpdpbusd(s), vpdpwssd(s) 9c745109 add vpdpbusd, vpdpbusds 0e1a11b4 add vpopcnt{b,w,d,q} 9acfc132 add vpshrd(v){w,d,q} ac8de850 add vpshld(v){w,d,q} f181c259 add vcompressb, vcompressw 5a402477 vpclmulqdq supports AVX-512 9e16b40b vaes* supports AVX-512 7fde08e0 add flags for intel's manual 319433-030.pdf c5da3778 add test of v4fmaddps, vp4dpwssd, etc. e4fc9d8a fix mpx encoding d0b2fb62 add bnd(0xf2) prefix for MPX f12b5678 use db for array cd74ab44 remove bat file git-subtree-dir: externals/xbyak git-subtree-split: 2794cde79eb71e86490061cac9622ad0067b8d15 --- gen/gen_avx512.cpp | 35 +++ gen/gen_code.cpp | 19 +- readme.md | 7 +- readme.txt | 7 +- sample/test_util.cpp | 12 +- test/6.bat | 8 - test/Makefile | 10 +- test/cybozu/test.hpp | 84 ++++-- test/jmp.cpp | 3 +- test/make_nm.cpp | 16 ++ test/misc.cpp | 580 +++++++++++++++++++++++++++++++++++++++++ test/nm_frame.cpp | 1 + test/test_address.bat | 2 +- xbyak/xbyak.h | 150 +++++++---- xbyak/xbyak_mnemonic.h | 48 +++- xbyak/xbyak_util.h | 126 +++++---- 16 files changed, 935 insertions(+), 173 deletions(-) delete mode 100644 test/6.bat diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 1c8cf9bc..5e0591ec 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -228,6 +228,9 @@ void putXM_X() { 0x8B, "vpcompressd", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 }, { 0x8B, "vpcompressq", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_N8 }, + + { 0x63, "vcompressb", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N1 }, + { 0x63, "vcompressw", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_N2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -343,6 +346,28 @@ void putX_X_XM_IMM() { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + + { 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, + { 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, + { 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, + + { 0x70, "vpshldvw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, false }, + { 0x71, "vpshldvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x71, "vpshldvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false }, + + { 0x72, "vpshrdw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, + { 0x73, "vpshrdd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, + { 0x73, "vpshrdq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, + + { 0x72, "vpshrdvw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, false }, + { 0x73, "vpshrdvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x73, "vpshrdvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false }, + + { 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + + { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -623,6 +648,14 @@ void putX_XM_IMM() { 0x56, "vreducepd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_SAE_Z, true }, { 0x56, "vreduceps", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32 | T_SAE_Z, true }, + + { 0x54, "vpopcntb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z, false }, + { 0x54, "vpopcntw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, false }, + { 0x55, "vpopcntd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, + { 0x55, "vpopcntq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false }, + + { 0x62, "vpexpandb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_N1, false }, + { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -661,6 +694,8 @@ void putMisc() puts("void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); }"); puts("void vfpclasssd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm); }"); puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }"); + + puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); } void putV4FMA() diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 4e5fd89d..fe0b59ac 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -60,7 +60,7 @@ void putX_X_XM(bool omitOnly) { 0x02, "pblendd", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 2 }, { 0x0B, "roundsd", T_0F3A | T_66 | T_W0, true, true, 3 }, { 0x0A, "roundss", T_0F3A | T_66 | T_W0, true, true, 3 }, - { 0x44, "pclmulqdq", T_0F3A | T_66 | T_W0, true, true, 3 }, + { 0x44, "pclmulqdq", T_0F3A | T_66 | T_W0 | T_YMM | T_EVEX, true, true, 3 }, { 0x0C, "permilps", T_0F38 | T_66 | T_W0 | T_YMM | T_EVEX | T_EW0 | T_B32, false, false, 2 }, { 0x0D, "permilpd", T_0F38 | T_66 | T_W0 | T_YMM | T_EVEX | T_EW1 | T_B64, false, false, 2 }, @@ -202,6 +202,10 @@ void putX_X_XM(bool omitOnly) { 0x14, "unpcklpd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64, false, true, 2 }, { 0x14, "unpcklps", T_0F | T_YMM | T_EVEX | T_EW0 | T_B32, false, true, 2 }, + + { 0xCF, "gf2p8affineinvqb", T_66 | T_0F3A | T_W1 | T_EVEX | T_YMM | T_EW1 | T_SAE_Z | T_B64, true, false, 3 }, + { 0xCE, "gf2p8affineqb", T_66 | T_0F3A | T_W1 | T_EVEX | T_YMM | T_EW1 | T_SAE_Z | T_B64, true, false, 3 }, + { 0xCF, "gf2p8mulb", T_66 | T_0F38 | T_W0 | T_EVEX | T_YMM | T_EW0 | T_SAE_Z, false, false, 3 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -615,6 +619,7 @@ void put() //////////////////////////////////////////////////////////////// { const GenericTbl tbl[] = { + { "bnd", 0xf2 }, /* 0xf2 prefix for MPX */ { "cbw", 0x66, 0x98 }, { "cdq", 0x99 }, { "clc", 0xF8 }, @@ -941,11 +946,11 @@ void put() puts("void bndcl(const BoundsReg& bnd, const Operand& op) { db(0xF3); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM()); }"); puts("void bndcu(const BoundsReg& bnd, const Operand& op) { db(0xF2); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM()); }"); puts("void bndcn(const BoundsReg& bnd, const Operand& op) { db(0xF2); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1B, NONE, !op.isMEM()); }"); - puts("void bndldx(const BoundsReg& bnd, const Address& addr) { opModM(addr, bnd, 0x0F, 0x1A); }"); + puts("void bndldx(const BoundsReg& bnd, const Address& addr) { opMIB(addr, bnd, 0x0F, 0x1A); }"); puts("void bndmk(const BoundsReg& bnd, const Address& addr) { db(0xF3); opModM(addr, bnd, 0x0F, 0x1B); }"); puts("void bndmov(const BoundsReg& bnd, const Operand& op) { db(0x66); opModRM(bnd, op, op.isBNDREG(), op.isMEM(), 0x0F, 0x1A); }"); puts("void bndmov(const Address& addr, const BoundsReg& bnd) { db(0x66); opModM(addr, bnd, 0x0F, 0x1B); }"); - puts("void bndstx(const Address& addr, const BoundsReg& bnd) { opModM(addr, bnd, 0x0F, 0x1B); }"); + puts("void bndstx(const Address& addr, const BoundsReg& bnd) { opMIB(addr, bnd, 0x0F, 0x1B); }"); } // misc { @@ -1257,10 +1262,10 @@ void put() { 0x7D, "hsubpd", T_0F | T_66 | T_YMM, 3 }, { 0x7D, "hsubps", T_0F | T_F2 | T_YMM, 3 }, - { 0xDC, "aesenc", T_0F38 | T_66 | T_W0, 3 }, - { 0xDD, "aesenclast", T_0F38 | T_66 | T_W0, 3 }, - { 0xDE, "aesdec", T_0F38 | T_66 | T_W0, 3 }, - { 0xDF, "aesdeclast", T_0F38 | T_66 | T_W0, 3 }, + { 0xDC, "aesenc", T_0F38 | T_66 | T_YMM | T_EVEX, 3 }, + { 0xDD, "aesenclast", T_0F38 | T_66 | T_YMM | T_EVEX, 3 }, + { 0xDE, "aesdec", T_0F38 | T_66 | T_YMM | T_EVEX, 3 }, + { 0xDF, "aesdeclast", T_0F38 | T_66 | T_YMM | T_EVEX, 3 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/readme.md b/readme.md index 4f5adf0d..ec1908d5 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -Xbyak 5.52 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ +Xbyak 5.601 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ ============= Abstract @@ -128,7 +128,7 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit -vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit +vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit ``` Remark * k1, ..., k7 are new opmask registers. @@ -333,6 +333,9 @@ The header files under xbyak/ are independent of cybozulib. History ------------- +* 2018/Jan/24 ver 5.601 add xword, yword, etc. into Xbyak::util namespace +* 2018/Jan/05 ver 5.60 support AVX-512 for Ice lake(319433-030.pdf) +* 2017/Aug/22 ver 5.53 fix mpx encoding, add bnd() prefix * 2017/Aug/18 ver 5.52 fix align (thanks to MerryMage) * 2017/Aug/17 ver 5.51 add multi-byte nop and align() uses it(thanks to inolen) * 2017/Aug/08 ver 5.50 add mpx(thanks to magurosan) diff --git a/readme.txt b/readme.txt index 2bc20f8f..aa99b85b 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.52 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.601 ----------------------------------------------------------------------------- ◎概要 @@ -155,7 +155,7 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit -vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit +vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 256-bit 注意 @@ -343,6 +343,9 @@ cybozulibは単体テストでのみ利用されていて、xbyak/ディレク ----------------------------------------------------------------------------- ◎履歴 +2018/01/24 ver 5.601 xword, ywordなどをXbyak::util名前空間に追加 +2018/01/05 ver 5.60 Ice lake系命令対応(319433-030.pdf) +2017/08/22 ver 5.53 mpxエンコーディングバグ修正, bnd()プレフィクス追加 2017/08/18 ver 5.52 align修正(thanks to MerryMage) 2017/08/17 ver 5.51 multi-byte nop追加 align()はそれを使用する(thanks to inolen) 2017/08/08 ver 5.50 mpx追加(thanks to magurosan) diff --git a/sample/test_util.cpp b/sample/test_util.cpp index d3497872..bb515db9 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -61,15 +61,23 @@ void putCPUinfo() { Cpu::tMOVBE, "movbe" }, { Cpu::tAVX512F, "avx512f" }, { Cpu::tAVX512DQ, "avx512dq" }, - { Cpu::tAVX512IFMA, "avx512ifma" }, + { Cpu::tAVX512IFMA, "avx512_ifma" }, { Cpu::tAVX512PF, "avx512pf" }, { Cpu::tAVX512ER, "avx512er" }, { Cpu::tAVX512CD, "avx512cd" }, { Cpu::tAVX512BW, "avx512bw" }, { Cpu::tAVX512VL, "avx512vl" }, - { Cpu::tAVX512VBMI, "avx512vbmi" }, + { Cpu::tAVX512VBMI, "avx512_vbmi" }, { Cpu::tAVX512_4VNNIW, "avx512_4vnniw" }, { Cpu::tAVX512_4FMAPS, "avx512_4fmaps" }, + + { Cpu::tAVX512_VBMI2, "avx512_vbmi2" }, + { Cpu::tGFNI, "gfni" }, + { Cpu::tVAES, "vaes" }, + { Cpu::tVPCLMULQDQ, "vpclmulqdq" }, + { Cpu::tAVX512_VNNI, "avx512_vnni" }, + { Cpu::tAVX512_BITALG, "avx512_bitalg" }, + { Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); diff --git a/test/6.bat b/test/6.bat deleted file mode 100644 index 6fc7e97e..00000000 --- a/test/6.bat +++ /dev/null @@ -1,8 +0,0 @@ -@echo off -rm a.lst b.lst -echo nasm -nasm -l a.lst -f win64 test.asm -cat a.lst -echo yasm -yasm -l b.lst -f win64 test.asm -cat b.lst diff --git a/test/Makefile b/test/Makefile index bcd63b66..e07e1bf1 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,11 +1,11 @@ -TARGET = make_nm normalize_prefix jmp address nm_frame bad_address misc +TARGET = make_nm normalize_prefix jmp address bad_address misc XBYAK_INC=../xbyak/xbyak.h BIT=32 ifeq ($(shell uname -m),x86_64) BIT=64 endif -ifeq ($(MODE_BIT),64) +ifeq ($(BIT),64) TARGET += jmp64 address64 endif @@ -28,14 +28,12 @@ address: address.cpp ../xbyak/xbyak.h $(CXX) $(CFLAGS) address.cpp -o $@ -m32 address64: address.cpp ../xbyak/xbyak.h $(CXX) $(CFLAGS) address.cpp -o $@ -m64 -nm_frame: nm_frame.cpp ../xbyak/xbyak.h - $(CXX) $(CFLAGS) nm_frame.cpp -o $@ -m32 bad_address: bad_address.cpp ../xbyak/xbyak.h $(CXX) $(CFLAGS) bad_address.cpp -o $@ misc: misc.cpp ../xbyak/xbyak.h $(CXX) $(CFLAGS) misc.cpp -o $@ -test: normalize_prefix jmp bad_address +test: normalize_prefix jmp bad_address $(TARGET) $(MAKE) -C ../gen ./test_nm.sh ./test_nm.sh Y @@ -65,7 +63,7 @@ ifeq ($(BIT),64) ./test_avx512.sh 64 endif clean: - rm -rf *.o $(TARGET) lib_run + rm -rf *.o $(TARGET) lib_run nm.cpp nm_frame make_512 lib_run: lib_test.cpp lib_run.cpp lib.h $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run diff --git a/test/cybozu/test.hpp b/test/cybozu/test.hpp index d192b1c9..fa735d21 100644 --- a/test/cybozu/test.hpp +++ b/test/cybozu/test.hpp @@ -86,13 +86,15 @@ public: } fflush(stdout); if (msg.empty()) { + int err = ngCount_ + exceptionCount_; + int total = okCount_ + err; std::cout << "ctest:name=" << getBaseName(*argv) << ", module=" << list_.size() - << ", total=" << (okCount_ + ngCount_ + exceptionCount_) + << ", total=" << total << ", ok=" << okCount_ << ", ng=" << ngCount_ << ", exception=" << exceptionCount_ << std::endl; - return 0; + return err > 0 ? 1 : 0; } else { std::cout << msg << std::endl; return 1; @@ -128,6 +130,15 @@ bool isEqual(const T& lhs, const U& rhs) return lhs == rhs; } +// avoid warning of comparision of integers of different signs +inline bool isEqual(size_t lhs, int rhs) +{ + return lhs == size_t(rhs); +} +inline bool isEqual(int lhs, size_t rhs) +{ + return size_t(lhs) == rhs; +} inline bool isEqual(const char *lhs, const char *rhs) { return strcmp(lhs, rhs) == 0; @@ -188,9 +199,9 @@ int main(int argc, char *argv[]) @param y [in] */ #define CYBOZU_TEST_EQUAL(x, y) { \ - bool eq = cybozu::test::isEqual(x, y); \ - cybozu::test::test(eq, "CYBOZU_TEST_EQUAL", #x ", " #y, __FILE__, __LINE__); \ - if (!eq) { \ + bool _cybozu_eq = cybozu::test::isEqual(x, y); \ + cybozu::test::test(_cybozu_eq, "CYBOZU_TEST_EQUAL", #x ", " #y, __FILE__, __LINE__); \ + if (!_cybozu_eq) { \ std::cout << "ctest: lhs=" << (x) << std::endl; \ std::cout << "ctest: rhs=" << (y) << std::endl; \ } \ @@ -201,22 +212,39 @@ int main(int argc, char *argv[]) @param y [in] */ #define CYBOZU_TEST_NEAR(x, y, eps) { \ - bool isNear = fabs((x) - (y)) < eps; \ - cybozu::test::test(isNear, "CYBOZU_TEST_NEAR", #x ", " #y, __FILE__, __LINE__); \ - if (!isNear) { \ + bool _cybozu_isNear = fabs((x) - (y)) < eps; \ + cybozu::test::test(_cybozu_isNear, "CYBOZU_TEST_NEAR", #x ", " #y, __FILE__, __LINE__); \ + if (!_cybozu_isNear) { \ std::cout << "ctest: lhs=" << (x) << std::endl; \ std::cout << "ctest: rhs=" << (y) << std::endl; \ } \ } #define CYBOZU_TEST_EQUAL_POINTER(x, y) { \ - bool eq = x == y; \ - cybozu::test::test(eq, "CYBOZU_TEST_EQUAL_POINTER", #x ", " #y, __FILE__, __LINE__); \ - if (!eq) { \ + bool _cybozu_eq = x == y; \ + cybozu::test::test(_cybozu_eq, "CYBOZU_TEST_EQUAL_POINTER", #x ", " #y, __FILE__, __LINE__); \ + if (!_cybozu_eq) { \ std::cout << "ctest: lhs=" << static_cast(x) << std::endl; \ std::cout << "ctest: rhs=" << static_cast(y) << std::endl; \ } \ } +/** + alert if x[] != y[] + @param x [in] + @param y [in] + @param n [in] +*/ +#define CYBOZU_TEST_EQUAL_ARRAY(x, y, n) { \ + for (size_t _cybozu_test_i = 0, _cybozu_ie = (size_t)(n); _cybozu_test_i < _cybozu_ie; _cybozu_test_i++) { \ + bool _cybozu_eq = cybozu::test::isEqual((x)[_cybozu_test_i], (y)[_cybozu_test_i]); \ + cybozu::test::test(_cybozu_eq, "CYBOZU_TEST_EQUAL_ARRAY", #x ", " #y ", " #n, __FILE__, __LINE__); \ + if (!_cybozu_eq) { \ + std::cout << "ctest: i=" << _cybozu_test_i << std::endl; \ + std::cout << "ctest: lhs=" << (x)[_cybozu_test_i] << std::endl; \ + std::cout << "ctest: rhs=" << (y)[_cybozu_test_i] << std::endl; \ + } \ + } \ +} /** always alert @@ -229,25 +257,25 @@ int main(int argc, char *argv[]) */ #define CYBOZU_TEST_EXCEPTION_MESSAGE(statement, Exception, msg) \ { \ - int ret = 0; \ - std::string errMsg; \ + int _cybozu_ret = 0; \ + std::string _cybozu_errMsg; \ try { \ statement; \ - ret = 1; \ - } catch (const Exception& e) { \ - errMsg = e.what(); \ - if (errMsg.find(msg) == std::string::npos) { \ - ret = 2; \ + _cybozu_ret = 1; \ + } catch (const Exception& _cybozu_e) { \ + _cybozu_errMsg = _cybozu_e.what(); \ + if (_cybozu_errMsg.find(msg) == std::string::npos) { \ + _cybozu_ret = 2; \ } \ } catch (...) { \ - ret = 3; \ + _cybozu_ret = 3; \ } \ - if (ret) { \ + if (_cybozu_ret) { \ cybozu::test::test(false, "CYBOZU_TEST_EXCEPTION_MESSAGE", #statement ", " #Exception ", " #msg, __FILE__, __LINE__); \ - if (ret == 1) { \ + if (_cybozu_ret == 1) { \ std::cout << "ctest: no exception" << std::endl; \ - } else if (ret == 2) { \ - std::cout << "ctest: bad exception msg:" << errMsg << std::endl; \ + } else if (_cybozu_ret == 2) { \ + std::cout << "ctest: bad exception msg:" << _cybozu_errMsg << std::endl; \ } else { \ std::cout << "ctest: unexpected exception" << std::endl; \ } \ @@ -258,17 +286,17 @@ int main(int argc, char *argv[]) #define CYBOZU_TEST_EXCEPTION(statement, Exception) \ { \ - int ret = 0; \ + int _cybozu_ret = 0; \ try { \ statement; \ - ret = 1; \ + _cybozu_ret = 1; \ } catch (const Exception&) { \ } catch (...) { \ - ret = 2; \ + _cybozu_ret = 2; \ } \ - if (ret) { \ + if (_cybozu_ret) { \ cybozu::test::test(false, "CYBOZU_TEST_EXCEPTION", #statement ", " #Exception, __FILE__, __LINE__); \ - if (ret == 1) { \ + if (_cybozu_ret == 1) { \ std::cout << "ctest: no exception" << std::endl; \ } else { \ std::cout << "ctest: unexpected exception" << std::endl; \ diff --git a/test/jmp.cpp b/test/jmp.cpp index eb2c9c7f..79d54904 100644 --- a/test/jmp.cpp +++ b/test/jmp.cpp @@ -1,6 +1,7 @@ #include #include #include +#define XBYAK_NO_OP_NAMES #include #include #include @@ -401,7 +402,7 @@ CYBOZU_TEST_AUTO(test5) using namespace Xbyak; inLocalLabel(); mov(ecx, count); - xor(eax, eax); + xor_(eax, eax); L(".lp"); for (int i = 0; i < count; i++) { L(Label::toStr(i)); diff --git a/test/make_nm.cpp b/test/make_nm.cpp index 6eff1e53..72c82472 100644 --- a/test/make_nm.cpp +++ b/test/make_nm.cpp @@ -1363,6 +1363,22 @@ class Test { put("bndmk", BNDREG, MEM); put("bndmov", BNDREG, BNDREG|MEM); put("bndstx", MEM, BNDREG); + put("bndstx", "ptr [eax]", "[eax]", BNDREG); + put("bndstx", "ptr [eax+5]", "[eax+5]", BNDREG); + put("bndstx", "ptr [eax+500]", "[eax+500]", BNDREG); + put("bndstx", "ptr [eax+ecx]", "[eax+ecx]", BNDREG); + put("bndstx", "ptr [ecx+eax]", "[ecx+eax]", BNDREG); + put("bndstx", "ptr [eax+esp]", "[eax+esp]", BNDREG); + put("bndstx", "ptr [esp+eax]", "[esp+eax]", BNDREG); + put("bndstx", "ptr [eax+ecx*2]", "[eax+ecx*2]", BNDREG); + put("bndstx", "ptr [ecx+ecx]", "[ecx+ecx]", BNDREG); + put("bndstx", "ptr [ecx*2]", "[ecx*2]", BNDREG); + put("bndstx", "ptr [eax+ecx*2+500]", "[eax+ecx*2+500]", BNDREG); +#ifdef XBYAK64 + put("bndstx", "ptr [rax+rcx*2]", "[rax+rcx*2]", BNDREG); + put("bndstx", "ptr [r9*2]", "[r9*2]", BNDREG); + put("bndstx", "ptr [r9*2+r15]", "[r9*2+r15]", BNDREG); +#endif } void putFpuMem16_32() const { diff --git a/test/misc.cpp b/test/misc.cpp index 4747f810..701111c9 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -103,3 +103,583 @@ CYBOZU_TEST_AUTO(align) } } c; } + +#ifdef XBYAK64 +CYBOZU_TEST_AUTO(vfmaddps) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); + v4fmaddss(xmm15, xmm8, ptr [rax + 64]); + v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); + v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); + vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); + vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf2, 0x3f, 0x48, 0x9a, 0x4a, 0x04, + 0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04, + 0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08, + 0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08, + 0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04, + 0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vaes) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vaesdec(xmm20, xmm30, ptr [rcx + 64]); + vaesdec(ymm1, ymm2, ptr [rcx + 64]); + vaesdec(zmm1, zmm2, ptr [rcx + 64]); + + vaesdeclast(xmm20, xmm30, ptr [rax + 64]); + vaesdeclast(ymm20, ymm30, ptr [rax + 64]); + vaesdeclast(zmm20, zmm30, ptr [rax + 64]); + + vaesenc(xmm20, xmm30, ptr [rcx + 64]); + vaesenc(ymm1, ymm2, ptr [rcx + 64]); + vaesenc(zmm1, zmm2, ptr [rcx + 64]); + + vaesenclast(xmm20, xmm30, ptr [rax + 64]); + vaesenclast(ymm20, ymm30, ptr [rax + 64]); + vaesenclast(zmm20, zmm30, ptr [rax + 64]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xE2, 0x0D, 0x00, 0xDE, 0x61, 0x04, + 0xC4, 0xE2, 0x6D, 0xDE, 0x49, 0x40, + 0x62, 0xF2, 0x6D, 0x48, 0xDE, 0x49, 0x01, + + 0x62, 0xE2, 0x0D, 0x00, 0xDF, 0x60, 0x04, + 0x62, 0xE2, 0x0D, 0x20, 0xDF, 0x60, 0x02, + 0x62, 0xE2, 0x0D, 0x40, 0xDF, 0x60, 0x01, + + 0x62, 0xE2, 0x0D, 0x00, 0xDC, 0x61, 0x04, + 0xC4, 0xE2, 0x6D, 0xDC, 0x49, 0x40, + 0x62, 0xF2, 0x6D, 0x48, 0xDC, 0x49, 0x01, + + 0x62, 0xE2, 0x0D, 0x00, 0xDD, 0x60, 0x04, + 0x62, 0xE2, 0x0D, 0x20, 0xDD, 0x60, 0x02, + 0x62, 0xE2, 0x0D, 0x40, 0xDD, 0x60, 0x01, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vpclmulqdq) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); + vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); + vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); + + vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); + vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); + vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); + } + } c; + const uint8_t tbl[] = { + 0xc4, 0xe3, 0x61, 0x44, 0x50, 0x40, 0x03, + 0xc4, 0xe3, 0x65, 0x44, 0x50, 0x40, 0x03, + 0x62, 0xf3, 0x65, 0x48, 0x44, 0x50, 0x01, 0x03, + 0x62, 0xe3, 0x65, 0x08, 0x44, 0x60, 0x04, 0x03, + 0x62, 0xe3, 0x65, 0x28, 0x44, 0x60, 0x02, 0x03, + 0x62, 0xe3, 0x65, 0x48, 0x44, 0x60, 0x01, 0x03, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vcompressb_w) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vcompressb(ptr[rax + 64], xmm1); + vcompressb(xmm30 | k5, xmm1); + vcompressb(ptr[rax + 64], ymm1); + vcompressb(ymm30 | k3 |T_z, ymm1); + vcompressb(ptr[rax + 64], zmm1); + vcompressb(zmm30 | k2 |T_z, zmm1); + + vcompressw(ptr[rax + 64], xmm1); + vcompressw(xmm30 | k5, xmm1); + vcompressw(ptr[rax + 64], ymm1); + vcompressw(ymm30 | k3 |T_z, ymm1); + vcompressw(ptr[rax + 64], zmm1); + vcompressw(zmm30 | k2 |T_z, zmm1); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf2, 0x7d, 0x08, 0x63, 0x48, 0x40, + 0x62, 0x92, 0x7d, 0x0d, 0x63, 0xce, + 0x62, 0xf2, 0x7d, 0x28, 0x63, 0x48, 0x40, + 0x62, 0x92, 0x7d, 0xab, 0x63, 0xce, + 0x62, 0xf2, 0x7d, 0x48, 0x63, 0x48, 0x40, + 0x62, 0x92, 0x7d, 0xca, 0x63, 0xce, + + 0x62, 0xf2, 0xfd, 0x08, 0x63, 0x48, 0x20, + 0x62, 0x92, 0xfd, 0x0d, 0x63, 0xce, + 0x62, 0xf2, 0xfd, 0x28, 0x63, 0x48, 0x20, + 0x62, 0x92, 0xfd, 0xab, 0x63, 0xce, + 0x62, 0xf2, 0xfd, 0x48, 0x63, 0x48, 0x20, + 0x62, 0x92, 0xfd, 0xca, 0x63, 0xce, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(shld) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + + vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + + vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf3, 0xed, 0x8b, 0x70, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0xed, 0xab, 0x70, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0xed, 0xcb, 0x70, 0x68, 0x01, 0x05, + + 0x62, 0xf3, 0x6d, 0x8b, 0x71, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0x6d, 0xab, 0x71, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0x6d, 0xcb, 0x71, 0x68, 0x01, 0x05, + + 0x62, 0xf3, 0xed, 0x8b, 0x71, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0xed, 0xab, 0x71, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0xed, 0xcb, 0x71, 0x68, 0x01, 0x05, + + 0x62, 0xf2, 0xed, 0x8b, 0x70, 0x68, 0x04, + 0x62, 0xf2, 0xed, 0xab, 0x70, 0x68, 0x02, + 0x62, 0xf2, 0xed, 0xcb, 0x70, 0x68, 0x01, + + 0x62, 0xf2, 0x6d, 0x8b, 0x71, 0x68, 0x04, + 0x62, 0xf2, 0x6d, 0xab, 0x71, 0x68, 0x02, + 0x62, 0xf2, 0x6d, 0xcb, 0x71, 0x68, 0x01, + + 0x62, 0xf2, 0xed, 0x8b, 0x71, 0x68, 0x04, + 0x62, 0xf2, 0xed, 0xab, 0x71, 0x68, 0x02, + 0x62, 0xf2, 0xed, 0xcb, 0x71, 0x68, 0x01, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(shrd) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); + vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); + vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); + + vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + + vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + + vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); + vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); + vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); + + vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); + vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); + vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); + + vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); + vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); + vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); + + vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); + vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); + vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); + + vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); + vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); + vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf3, 0xed, 0x8b, 0x72, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0xed, 0xab, 0x72, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0xed, 0xcb, 0x72, 0x68, 0x01, 0x05, + + 0x62, 0xf3, 0x6d, 0x8b, 0x73, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0x6d, 0xab, 0x73, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0x6d, 0xcb, 0x73, 0x68, 0x01, 0x05, + + 0x62, 0xf3, 0xed, 0x8b, 0x73, 0x68, 0x04, 0x05, + 0x62, 0xf3, 0xed, 0xab, 0x73, 0x68, 0x02, 0x05, + 0x62, 0xf3, 0xed, 0xcb, 0x73, 0x68, 0x01, 0x05, + + 0x62, 0xf2, 0xed, 0x8b, 0x72, 0x68, 0x04, + 0x62, 0xf2, 0xed, 0xab, 0x72, 0x68, 0x02, + 0x62, 0xf2, 0xed, 0xcb, 0x72, 0x68, 0x01, + + 0x62, 0xf2, 0x6d, 0x8b, 0x73, 0x68, 0x04, + 0x62, 0xf2, 0x6d, 0xab, 0x73, 0x68, 0x02, + 0x62, 0xf2, 0x6d, 0xcb, 0x73, 0x68, 0x01, + + 0x62, 0xf2, 0xed, 0x8b, 0x73, 0x68, 0x04, + 0x62, 0xf2, 0xed, 0xab, 0x73, 0x68, 0x02, + 0x62, 0xf2, 0xed, 0xcb, 0x73, 0x68, 0x01, + + 0x62, 0xf3, 0x6d, 0x9b, 0x73, 0x68, 0x10, 0x05, + 0x62, 0xf3, 0x6d, 0xbb, 0x73, 0x68, 0x10, 0x05, + 0x62, 0xf3, 0x6d, 0xdb, 0x73, 0x68, 0x10, 0x05, + + 0x62, 0xf3, 0xed, 0x9b, 0x73, 0x68, 0x08, 0x05, + 0x62, 0xf3, 0xed, 0xbb, 0x73, 0x68, 0x08, 0x05, + 0x62, 0xf3, 0xed, 0xdb, 0x73, 0x68, 0x08, 0x05, + + 0x62, 0xf2, 0x6d, 0x9b, 0x73, 0x68, 0x10, + 0x62, 0xf2, 0x6d, 0xbb, 0x73, 0x68, 0x10, + 0x62, 0xf2, 0x6d, 0xdb, 0x73, 0x68, 0x10, + + 0x62, 0xf2, 0xed, 0x9b, 0x73, 0x68, 0x08, + 0x62, 0xf2, 0xed, 0xbb, 0x73, 0x68, 0x08, + 0x62, 0xf2, 0xed, 0xdb, 0x73, 0x68, 0x08, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vpopcnt) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); + vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); + vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); + vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); + vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); + vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); + vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); + vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); + vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); + + vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); + vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); + vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); + vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); + vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf2, 0x7d, 0x8b, 0x54, 0x68, 0x04, + 0x62, 0xf2, 0x7d, 0xab, 0x54, 0x68, 0x02, + 0x62, 0xf2, 0x7d, 0xcb, 0x54, 0x68, 0x01, + + 0x62, 0xf2, 0xfd, 0x8b, 0x54, 0x68, 0x04, + 0x62, 0xf2, 0xfd, 0xab, 0x54, 0x68, 0x02, + 0x62, 0xf2, 0xfd, 0xcb, 0x54, 0x68, 0x01, + + 0x62, 0xf2, 0x7d, 0x8b, 0x55, 0x68, 0x04, + 0x62, 0xf2, 0x7d, 0xab, 0x55, 0x68, 0x02, + 0x62, 0xf2, 0x7d, 0xcb, 0x55, 0x68, 0x01, + + 0x62, 0xf2, 0x7d, 0x9b, 0x55, 0x68, 0x10, + 0x62, 0xf2, 0x7d, 0xbb, 0x55, 0x68, 0x10, + 0x62, 0xf2, 0x7d, 0xdb, 0x55, 0x68, 0x10, + + 0x62, 0xf2, 0xfd, 0x8b, 0x55, 0x68, 0x04, + 0x62, 0xf2, 0xfd, 0xab, 0x55, 0x68, 0x02, + 0x62, 0xf2, 0xfd, 0xcb, 0x55, 0x68, 0x01, + + 0x62, 0xf2, 0xfd, 0x9b, 0x55, 0x68, 0x08, + 0x62, 0xf2, 0xfd, 0xbb, 0x55, 0x68, 0x08, + 0x62, 0xf2, 0xfd, 0xdb, 0x55, 0x68, 0x08, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vpdpbus) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); + vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); + vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); + + vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); + vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); + vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); + + vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); + vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); + vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); + + vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); + vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); + vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); + + vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); + vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); + vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); + + vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); + vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); + vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); + + vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); + vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); + vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); + + vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); + vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); + vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf2, 0x5d, 0x83, 0x50, 0x68, 0x04, + 0x62, 0xf2, 0x5d, 0xa3, 0x50, 0x68, 0x02, + 0x62, 0xf2, 0x5d, 0xc3, 0x50, 0x68, 0x01, + + 0x62, 0xf2, 0x5d, 0x93, 0x50, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xb3, 0x50, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xd3, 0x50, 0x68, 0x10, + + 0x62, 0xf2, 0x5d, 0x83, 0x51, 0x68, 0x04, + 0x62, 0xf2, 0x5d, 0xa3, 0x51, 0x68, 0x02, + 0x62, 0xf2, 0x5d, 0xc3, 0x51, 0x68, 0x01, + + 0x62, 0xf2, 0x5d, 0x93, 0x51, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xb3, 0x51, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xd3, 0x51, 0x68, 0x10, + + 0x62, 0xf2, 0x5d, 0x83, 0x52, 0x68, 0x04, + 0x62, 0xf2, 0x5d, 0xa3, 0x52, 0x68, 0x02, + 0x62, 0xf2, 0x5d, 0xc3, 0x52, 0x68, 0x01, + + 0x62, 0xf2, 0x5d, 0x93, 0x52, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xb3, 0x52, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xd3, 0x52, 0x68, 0x10, + + 0x62, 0xf2, 0x5d, 0x83, 0x53, 0x68, 0x04, + 0x62, 0xf2, 0x5d, 0xa3, 0x53, 0x68, 0x02, + 0x62, 0xf2, 0x5d, 0xc3, 0x53, 0x68, 0x01, + + 0x62, 0xf2, 0x5d, 0x93, 0x53, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xb3, 0x53, 0x68, 0x10, + 0x62, 0xf2, 0x5d, 0xd3, 0x53, 0x68, 0x10, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(vexpand_vpshufbitqmb) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vpexpandb(xmm5|k3|T_z, xmm30); + vpexpandb(ymm5|k3|T_z, ymm30); + vpexpandb(zmm5|k3|T_z, zmm30); + vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); + vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); + vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpexpandw(xmm5|k3|T_z, xmm30); + vpexpandw(ymm5|k3|T_z, ymm30); + vpexpandw(zmm5|k3|T_z, zmm30); + vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); + vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); + vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); + + vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); + vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); + vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0x92, 0x7d, 0x8b, 0x62, 0xee, + 0x62, 0x92, 0x7d, 0xab, 0x62, 0xee, + 0x62, 0x92, 0x7d, 0xcb, 0x62, 0xee, + 0x62, 0xf2, 0x7d, 0x8b, 0x62, 0x68, 0x40, + 0x62, 0xf2, 0x7d, 0xab, 0x62, 0x68, 0x40, + 0x62, 0xf2, 0x7d, 0xcb, 0x62, 0x68, 0x40, + + 0x62, 0x92, 0xfd, 0x8b, 0x62, 0xee, + 0x62, 0x92, 0xfd, 0xab, 0x62, 0xee, + 0x62, 0x92, 0xfd, 0xcb, 0x62, 0xee, + 0x62, 0xf2, 0xfd, 0x8b, 0x62, 0x68, 0x20, + 0x62, 0xf2, 0xfd, 0xab, 0x62, 0x68, 0x20, + 0x62, 0xf2, 0xfd, 0xcb, 0x62, 0x68, 0x20, + + 0x62, 0xf2, 0x6d, 0x0a, 0x8f, 0x48, 0x04, + 0x62, 0xf2, 0x6d, 0x2a, 0x8f, 0x48, 0x02, + 0x62, 0xf2, 0x6d, 0x4a, 0x8f, 0x48, 0x01, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +CYBOZU_TEST_AUTO(gf2) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + /// + gf2p8affineinvqb(xmm1, xmm2, 3); + gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); + + vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); + vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); + vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); + vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); + + vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); + vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); + vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); + + vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); + vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); + vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); + + vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); + vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); + vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); + /// + gf2p8affineqb(xmm1, xmm2, 3); + gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); + + vgf2p8affineqb(xmm1, xmm5, xmm2, 3); + vgf2p8affineqb(ymm1, ymm5, ymm2, 3); + vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); + vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); + + vgf2p8affineqb(xmm30, xmm31, xmm4, 5); + vgf2p8affineqb(ymm30, ymm31, ymm4, 5); + vgf2p8affineqb(zmm30, zmm31, zmm4, 5); + + vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); + vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); + vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); + + vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); + vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); + vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); + /// + gf2p8mulb(xmm1, xmm2); + gf2p8mulb(xmm1, ptr [rax + 0x40]); + + vgf2p8mulb(xmm1, xmm5, xmm2); + vgf2p8mulb(ymm1, ymm5, ymm2); + vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); + vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); + + vgf2p8mulb(xmm30, xmm31, xmm4); + vgf2p8mulb(ymm30, ymm31, ymm4); + vgf2p8mulb(zmm30, zmm31, zmm4); + + vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); + vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); + vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); + } + } c; + const uint8_t tbl[] = { + 0x66, 0x0f, 0x3a, 0xcf, 0xca, 0x03, + 0x66, 0x0f, 0x3a, 0xcf, 0x48, 0x40, 0x03, + 0xc4, 0xe3, 0xd1, 0xcf, 0xca, 0x03, + 0xc4, 0xe3, 0xd5, 0xcf, 0xca, 0x03, + 0xc4, 0xe3, 0xd1, 0xcf, 0x48, 0x40, 0x03, + 0xc4, 0xe3, 0xd5, 0xcf, 0x48, 0x40, 0x03, + 0x62, 0x63, 0x85, 0x00, 0xcf, 0xf4, 0x05, + 0x62, 0x63, 0x85, 0x20, 0xcf, 0xf4, 0x05, + 0x62, 0x63, 0x85, 0x40, 0xcf, 0xf4, 0x05, + 0x62, 0x63, 0xd5, 0x89, 0xcf, 0x70, 0x04, 0x05, + 0x62, 0x63, 0xd5, 0xa9, 0xcf, 0x70, 0x02, 0x05, + 0x62, 0x63, 0xd5, 0xc9, 0xcf, 0x70, 0x01, 0x05, + 0x62, 0x63, 0xd5, 0x99, 0xcf, 0x70, 0x08, 0x05, + 0x62, 0x63, 0xd5, 0xb9, 0xcf, 0x70, 0x08, 0x05, + 0x62, 0x63, 0xd5, 0xd9, 0xcf, 0x70, 0x08, 0x05, + + 0x66, 0x0f, 0x3a, 0xce, 0xca, 0x03, + 0x66, 0x0f, 0x3a, 0xce, 0x48, 0x40, 0x03, + 0xc4, 0xe3, 0xd1, 0xce, 0xca, 0x03, + 0xc4, 0xe3, 0xd5, 0xce, 0xca, 0x03, + 0xc4, 0xe3, 0xd1, 0xce, 0x48, 0x40, 0x03, + 0xc4, 0xe3, 0xd5, 0xce, 0x48, 0x40, 0x03, + 0x62, 0x63, 0x85, 0x00, 0xce, 0xf4, 0x05, + 0x62, 0x63, 0x85, 0x20, 0xce, 0xf4, 0x05, + 0x62, 0x63, 0x85, 0x40, 0xce, 0xf4, 0x05, + 0x62, 0x63, 0xd5, 0x89, 0xce, 0x70, 0x04, 0x05, + 0x62, 0x63, 0xd5, 0xa9, 0xce, 0x70, 0x02, 0x05, + 0x62, 0x63, 0xd5, 0xc9, 0xce, 0x70, 0x01, 0x05, + 0x62, 0x63, 0xd5, 0x99, 0xce, 0x70, 0x08, 0x05, + 0x62, 0x63, 0xd5, 0xb9, 0xce, 0x70, 0x08, 0x05, + 0x62, 0x63, 0xd5, 0xd9, 0xce, 0x70, 0x08, 0x05, + + 0x66, 0x0f, 0x38, 0xcf, 0xca, + 0x66, 0x0f, 0x38, 0xcf, 0x48, 0x40, + 0xc4, 0xe2, 0x51, 0xcf, 0xca, + 0xc4, 0xe2, 0x55, 0xcf, 0xca, + 0xc4, 0xe2, 0x51, 0xcf, 0x48, 0x40, + 0xc4, 0xe2, 0x55, 0xcf, 0x48, 0x40, + 0x62, 0x62, 0x05, 0x00, 0xcf, 0xf4, + 0x62, 0x62, 0x05, 0x20, 0xcf, 0xf4, + 0x62, 0x62, 0x05, 0x40, 0xcf, 0xf4, + 0x62, 0x62, 0x55, 0x89, 0xcf, 0x70, 0x04, + 0x62, 0x62, 0x55, 0xa9, 0xcf, 0x70, 0x02, + 0x62, 0x62, 0x55, 0xc9, 0xcf, 0x70, 0x01, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +#endif diff --git a/test/nm_frame.cpp b/test/nm_frame.cpp index 95240c7c..697c2c4a 100644 --- a/test/nm_frame.cpp +++ b/test/nm_frame.cpp @@ -6,6 +6,7 @@ using namespace Xbyak; #ifdef _MSC_VER #pragma warning(disable : 4245) + #pragma warning(disable : 4312) #endif class Sample : public CodeGenerator { void operator=(const Sample&); diff --git a/test/test_address.bat b/test/test_address.bat index 030c04da..f96542f1 100644 --- a/test/test_address.bat +++ b/test/test_address.bat @@ -1,5 +1,5 @@ @echo off -set FILTER=cat +set FILTER=grep -v warning if /i "%1"=="64" ( set OPT2=-DXBYAK64 set OPT3=win64 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 1f8e96cd..6ab93a09 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -105,7 +105,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5520 /* 0xABCD = A.BC(D) */ + VERSION = 0x5601 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -177,6 +177,7 @@ enum { ERR_INVALID_OPMASK_WITH_MEMORY, ERR_INVALID_ZERO, ERR_INVALID_RIP_IN_AUTO_GROW, + ERR_INVALID_MIB_ADDRESS, ERR_INTERNAL }; @@ -237,6 +238,7 @@ public: "invalid opmask with memory", "invalid zero", "invalid rip in AutoGrow", + "invalid mib address", "internal error", }; assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl)); @@ -345,6 +347,9 @@ public: }; #endif +class Address; +class Reg; + class Operand { static const uint8 EXT8BIT = 0x20; unsigned int idx_:6; // 0..31 + EXT8BIT = 1 if spl/bpl/sil/dil @@ -497,6 +502,8 @@ public: bool isEqualIfNotInherited(const Operand& rhs) const { return idx_ == rhs.idx_ && kind_ == rhs.kind_ && bit_ == rhs.bit_ && zero_ == rhs.zero_ && mask_ == rhs.mask_ && rounding_ == rhs.rounding_; } bool operator==(const Operand& rhs) const; bool operator!=(const Operand& rhs) const { return !operator==(rhs); } + const Address& getAddress() const; + const Reg& getReg() const; }; class Label; @@ -530,6 +537,12 @@ public: #endif }; +inline const Reg& Operand::getReg() const +{ + assert(!isMEM()); + return static_cast(*this); +} + struct Reg8 : public Reg { explicit Reg8(int idx = 0, bool ext8bit = false) : Reg(idx, Operand::REG, 8, ext8bit) { } }; @@ -543,6 +556,13 @@ struct Mmx : public Reg { }; struct EvexModifierRounding { + enum { + T_RN_SAE = 1, + T_RD_SAE = 2, + T_RU_SAE = 3, + T_RZ_SAE = 4, + T_SAE = 5 + }; explicit EvexModifierRounding(int rounding) : rounding(rounding) {} int rounding; }; @@ -689,13 +709,15 @@ public: } } bool isVsib(int bit = 128 | 256 | 512) const { return index_.isBit(bit); } - void optimize() + RegExp optimize() const { + RegExp exp = *this; // [reg * 2] => [reg + reg] - if (index_.isBit(i32e) && !base_.getBit() && index_.getBit() && scale_ == 2) { - base_ = index_; - scale_ = 1; + if (index_.isBit(i32e) && !base_.getBit() && scale_ == 2) { + exp.base_ = index_; + exp.scale_ = 1; } + return exp; } bool operator==(const RegExp& rhs) const { @@ -715,6 +737,11 @@ public: } friend RegExp operator+(const RegExp& a, const RegExp& b); friend RegExp operator-(const RegExp& e, size_t disp); + uint8 getRex() const + { + uint8 rex = index_.getRexX() | base_.getRexB(); + return rex ? uint8(rex | 0x40) : 0; + } private: /* [base_ + index_ * scale_ + disp_] @@ -975,7 +1002,6 @@ public: : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), permitVsib_(false), broadcast_(broadcast) { e_.verify(); - e_.optimize(); } #ifdef XBYAK64 explicit Address(size_t disp) @@ -984,7 +1010,10 @@ public: : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), permitVsib_(false), broadcast_(broadcast) { } #endif void permitVsib() const { permitVsib_ = true; } - const RegExp& getRegExp() const { return e_; } + RegExp getRegExp(bool optimize = true) const + { + return optimize ? e_.optimize() : e_; + } Mode getMode() const { return mode_; } bool is32bit() const { verify(); return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } bool isOnlyDisp() const { verify(); return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax @@ -993,9 +1022,7 @@ public: { verify(); if (mode_ != M_ModRM) return 0; - uint8 rex = e_.getIndex().getRexX() | e_.getBase().getRexB(); - if (rex) rex |= 0x40; - return rex; + return getRegExp().getRex(); } bool is64bitDisp() const { verify(); return mode_ == M_64bitDisp; } // for moffset bool isBroadcast() const { return broadcast_; } @@ -1014,9 +1041,15 @@ private: void verify() const { if (e_.isVsib() && !permitVsib_) throw Error(ERR_BAD_VSIB_ADDRESSING); } }; +inline const Address& Operand::getAddress() const +{ + assert(isMEM()); + return static_cast(*this); +} + inline bool Operand::operator==(const Operand& rhs) const { - if (isMEM() && rhs.isMEM()) return static_cast(*this) == static_cast(rhs); + if (isMEM() && rhs.isMEM()) return this->getAddress() == rhs.getAddress(); return isEqualIfNotInherited(rhs); } @@ -1363,12 +1396,12 @@ private: if (p1->isMEM()) std::swap(p1, p2); if (p1->isMEM()) throw Error(ERR_BAD_COMBINATION); if (p2->isMEM()) { - const Address& addr = static_cast(*p2); + const Address& addr = p2->getAddress(); if (BIT == 64 && addr.is32bit()) db(0x67); - rex = addr.getRex() | static_cast(*p1).getRex(); + rex = addr.getRex() | p1->getReg().getRex(); } else { // ModRM(reg, base); - rex = static_cast(op2).getRex(static_cast(op1)); + rex = op2.getReg().getRex(op1.getReg()); } // except movsx(16bit, 32/64bit) if ((op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e))) db(0x66); @@ -1447,13 +1480,6 @@ private: if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err); return v; } - enum { - T_RN_SAE = 1, - T_RD_SAE = 2, - T_RU_SAE = 3, - T_RZ_SAE = 4, - T_SAE = 5 - }; int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0) { if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID); @@ -1472,7 +1498,7 @@ private: int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET); int disp8N = 1; if (rounding) { - if (rounding == T_SAE){ + if (rounding == EvexModifierRounding::T_SAE) { verifySAE(base, type); LL = 0; } else { verifyER(base, type); LL = rounding - 1; @@ -1581,6 +1607,17 @@ private: db(code0 | (reg.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2); opAddr(addr, reg.getIdx(), immSize); } + void opMIB(const Address& addr, const Reg& reg, int code0, int code1) + { + if (addr.is64bitDisp()) throw Error(ERR_CANT_USE_64BIT_DISP); + if (addr.getMode() != Address::M_ModRM) throw Error(ERR_INVALID_MIB_ADDRESS); + if (BIT == 64 && addr.is32bit()) db(0x67); + const RegExp& regExp = addr.getRegExp(false); + uint8 rex = regExp.getRex(); + if (rex) db(rex); + db(code0); db(code1); + setSIB(regExp, reg.getIdx()); + } void makeJmp(uint32 disp, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref) { const int shortJmpSize = 2; @@ -1656,9 +1693,9 @@ private: if (isValid && !isValid(reg, op)) throw Error(ERR_BAD_COMBINATION); if (pref != NONE) db(pref); if (op.isMEM()) { - opModM(static_cast(op), static_cast(reg), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0); + opModM(op.getAddress(), reg.getReg(), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0); } else { - opModR(static_cast(reg), static_cast(op), 0x0F, preCode, code); + opModR(reg.getReg(), op.getReg(), 0x0F, preCode, code); } if (imm8 != NONE) db(imm8); } @@ -1676,9 +1713,9 @@ private: { if (pref != NONE) db(pref); if (op1.isXMM() && op2.isMEM()) { - opModM(static_cast(op2), static_cast(op1), 0x0F, code); + opModM(op2.getAddress(), op1.getReg(), 0x0F, code); } else if (op1.isMEM() && op2.isXMM()) { - opModM(static_cast(op1), static_cast(op2), 0x0F, code | 1); + opModM(op1.getAddress(), op2.getReg(), 0x0F, code | 1); } else { throw Error(ERR_BAD_COMBINATION); } @@ -1687,7 +1724,7 @@ private: { if (hasMMX2 && op.isREG(i32e)) { /* pextrw is special */ if (mmx.isXMM()) db(0x66); - opModR(static_cast(op), mmx, 0x0F, 0xC5); db(imm); + opModR(op.getReg(), mmx, 0x0F, 0xC5); db(imm); } else { opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, 0x3A); } @@ -1697,9 +1734,9 @@ private: int opBit = op.getBit(); if (disableRex && opBit == 64) opBit = 32; if (op.isREG(bit)) { - opModR(Reg(ext, Operand::REG, opBit), static_cast(op).changeBit(opBit), code0, code1, code2); + opModR(Reg(ext, Operand::REG, opBit), op.getReg().changeBit(opBit), code0, code1, code2); } else if (op.isMEM()) { - opModM(static_cast(op), Reg(ext, Operand::REG, opBit), code0, code1, code2, immSize); + opModM(op.getAddress(), Reg(ext, Operand::REG, opBit), code0, code1, code2, immSize); } else { throw Error(ERR_BAD_COMBINATION); } @@ -1718,9 +1755,9 @@ private: void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE, int immSize = 0) { if (condR) { - opModR(static_cast(op1), static_cast(op2), code0, code1, code2); + opModR(op1.getReg(), op2.getReg(), code0, code1, code2); } else if (condM) { - opModM(static_cast(op2), static_cast(op1), code0, code1, code2, immSize); + opModM(op2.getAddress(), op1.getReg(), code0, code1, code2, immSize); } else { throw Error(ERR_BAD_COMBINATION); } @@ -1735,7 +1772,7 @@ private: void opRM_RM(const Operand& op1, const Operand& op2, int code) { if (op1.isREG() && op2.isMEM()) { - opModM(static_cast(op2), static_cast(op1), code | 2); + opModM(op2.getAddress(), op1.getReg(), code | 2); } else { opModRM(op2, op1, op1.isREG() && op1.getKind() == op2.getKind(), op1.isMEM() && op2.isREG(), code); } @@ -1768,19 +1805,19 @@ private: #endif code = 0xFE; if (op.isREG()) { - opModR(Reg(ext, Operand::REG, op.getBit()), static_cast(op), code); + opModR(Reg(ext, Operand::REG, op.getBit()), op.getReg(), code); } else { - opModM(static_cast(op), Reg(ext, Operand::REG, op.getBit()), code); + opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code); } } void opPushPop(const Operand& op, int code, int ext, int alt) { if (op.isREG()) { if (op.isBit(16)) db(0x66); - if (static_cast(op).getIdx() >= 8) db(0x41); + if (op.getReg().getIdx() >= 8) db(0x41); db(alt | (op.getIdx() & 7)); } else if (op.isMEM()) { - opModM(static_cast(op), Reg(ext, Operand::REG, op.getBit()), code); + opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code); } else { throw Error(ERR_BAD_COMBINATION); } @@ -1869,11 +1906,12 @@ private: void opVex(const Reg& r, const Operand *p1, const Operand& op2, int type, int code, int imm8 = NONE) { if (op2.isMEM()) { - const Address& addr = static_cast(op2); - const Reg& base = addr.getRegExp().getBase(); + const Address& addr = op2.getAddress(); + const RegExp& regExp = addr.getRegExp(); + const Reg& base = regExp.getBase(); if (BIT == 64 && addr.is32bit()) db(0x67); int disp8N = 0; - bool x = addr.getRegExp().getIndex().isExtIdx(); + bool x = regExp.getIndex().isExtIdx(); if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { int aaa = addr.getOpmaskIdx(); if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY); @@ -1882,14 +1920,14 @@ private: if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST); b = true; } - int VL = addr.getRegExp().isVsib() ? addr.getRegExp().getIndex().getBit() : 0; + int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0; disp8N = evex(r, base, p1, type, code, x, b, aaa, VL); } else { vex(r, base, p1, type, code, x); } opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N); } else { - const Reg& base = static_cast(op2); + const Reg& base = op2.getReg(); if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) { evex(r, base, p1, type, code); } else { @@ -1971,11 +2009,12 @@ private: } void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8 code, int mode) { - if (!addr.getRegExp().isVsib(128 | 256)) throw Error(ERR_BAD_VSIB_ADDRESSING); + const RegExp& regExp = addr.getRegExp(); + if (!regExp.isVsib(128 | 256)) throw Error(ERR_BAD_VSIB_ADDRESSING); const int y_vx_y = 0; const int y_vy_y = 1; // const int x_vy_x = 2; - const bool isAddrYMM = addr.getRegExp().getIndex().getBit() == 256; + const bool isAddrYMM = regExp.getIndex().getBit() == 256; if (!x1.isXMM() || isAddrYMM || !x2.isXMM()) { bool isOK = false; if (mode == y_vx_y) { @@ -2174,13 +2213,13 @@ public: const Address *addr = 0; uint8 code = 0; if (reg1.isREG() && reg1.getIdx() == 0 && reg2.isMEM()) { // mov eax|ax|al, [disp] - reg = &static_cast(reg1); - addr= &static_cast(reg2); + reg = ®1.getReg(); + addr= ®2.getAddress(); code = 0xA0; } else if (reg1.isMEM() && reg2.isREG() && reg2.getIdx() == 0) { // mov [disp], eax|ax|al - reg = &static_cast(reg2); - addr= &static_cast(reg1); + reg = ®2.getReg(); + addr= ®1.getAddress(); code = 0xA2; } #ifdef XBYAK64 @@ -2207,7 +2246,7 @@ public: void mov(const Operand& op, size_t imm) { if (op.isREG()) { - const int size = mov_imm(static_cast(op), imm); + const int size = mov_imm(op.getReg(), imm); db(imm, size); } else if (op.isMEM()) { verifyMemHasSize(op); @@ -2219,7 +2258,7 @@ public: if (!inner::IsInInt32(imm)) throw Error(ERR_IMM_IS_TOO_BIG); immSize = 4; } - opModM(static_cast(op), Reg(0, Operand::REG, op.getBit()), 0xC6, NONE, NONE, immSize); + opModM(op.getAddress(), Reg(0, Operand::REG, op.getBit()), 0xC6, NONE, NONE, immSize); db(static_cast(imm), immSize); } else { throw Error(ERR_BAD_COMBINATION); @@ -2303,7 +2342,7 @@ public: } void mov(const Segment& seg, const Operand& op) { - opModRM(Reg8(seg.getIdx()), op.isREG(16|i32e) ? static_cast(static_cast(op).cvt32()) : op, op.isREG(16|i32e), op.isMEM(), 0x8E); + opModRM(Reg8(seg.getIdx()), op.isREG(16|i32e) ? static_cast(op.getReg().cvt32()) : op, op.isREG(16|i32e), op.isMEM(), 0x8E); } #endif @@ -2328,7 +2367,7 @@ public: , st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7) , k0(0), k1(1), k2(2), k3(3), k4(4), k5(5), k6(6), k7(7) , bnd0(0), bnd1(1), bnd2(2), bnd3(3) - , T_sae(T_SAE), T_rn_sae(T_RN_SAE), T_rd_sae(T_RD_SAE), T_ru_sae(T_RU_SAE), T_rz_sae(T_RZ_SAE) + , T_sae(EvexModifierRounding::T_SAE), T_rn_sae(EvexModifierRounding::T_RN_SAE), T_rd_sae(EvexModifierRounding::T_RD_SAE), T_ru_sae(EvexModifierRounding::T_RU_SAE), T_rz_sae(EvexModifierRounding::T_RZ_SAE) , T_z() #ifdef XBYAK64 , rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15) @@ -2423,9 +2462,7 @@ public: while (size > 0) { size_t len = (std::min)(n, size); const uint8 *seq = nopTbl[len - 1]; - for (size_t i = 0; i < len; i++) { - db(seq[i]); - } + db(seq, len); size -= len; } } @@ -2456,10 +2493,13 @@ static const Zmm zmm0(0), zmm1(1), zmm2(2), zmm3(3), zmm4(4), zmm5(5), zmm6(6), static const Reg32 eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI); static const Reg16 ax(Operand::AX), cx(Operand::CX), dx(Operand::DX), bx(Operand::BX), sp(Operand::SP), bp(Operand::BP), si(Operand::SI), di(Operand::DI); static const Reg8 al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH); -static const AddressFrame ptr(0), byte(8), word(16), dword(32), qword(64); +static const AddressFrame ptr(0), byte(8), word(16), dword(32), qword(64), xword(128), yword(256), zword(512); +static const AddressFrame ptr_b(0, true), xword_b(128, true), yword_b(256, true), zword_b(512, true); static const Fpu st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7); static const Opmask k0(0), k1(1), k2(2), k3(3), k4(4), k5(5), k6(6), k7(7); static const BoundsReg bnd0(0), bnd1(1), bnd2(2), bnd3(3); +static const EvexModifierRounding T_sae(EvexModifierRounding::T_SAE), T_rn_sae(EvexModifierRounding::T_RN_SAE), T_rd_sae(EvexModifierRounding::T_RD_SAE), T_ru_sae(EvexModifierRounding::T_RU_SAE), T_rz_sae(EvexModifierRounding::T_RZ_SAE); +static const EvexModifierZero T_z; #ifdef XBYAK64 static const Reg64 rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15); static const Reg32 r8d(8), r9d(9), r10d(10), r11d(11), r12d(12), r13d(13), r14d(14), r15d(15); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 4dbadb68..1bec88ec 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.52"; } +const char *getVersionString() const { return "5.601"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -32,14 +32,15 @@ void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, is void blsi(const Reg32e& r, const Operand& op) { opGpr(Reg32e(3, r.getBit()), op, r, T_0F38, 0xf3, false); } void blsmsk(const Reg32e& r, const Operand& op) { opGpr(Reg32e(2, r.getBit()), op, r, T_0F38, 0xf3, false); } void blsr(const Reg32e& r, const Operand& op) { opGpr(Reg32e(1, r.getBit()), op, r, T_0F38, 0xf3, false); } +void bnd() { db(0xF2); } void bndcl(const BoundsReg& bnd, const Operand& op) { db(0xF3); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM()); } void bndcn(const BoundsReg& bnd, const Operand& op) { db(0xF2); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1B, NONE, !op.isMEM()); } void bndcu(const BoundsReg& bnd, const Operand& op) { db(0xF2); opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM()); } -void bndldx(const BoundsReg& bnd, const Address& addr) { opModM(addr, bnd, 0x0F, 0x1A); } +void bndldx(const BoundsReg& bnd, const Address& addr) { opMIB(addr, bnd, 0x0F, 0x1A); } void bndmk(const BoundsReg& bnd, const Address& addr) { db(0xF3); opModM(addr, bnd, 0x0F, 0x1B); } void bndmov(const Address& addr, const BoundsReg& bnd) { db(0x66); opModM(addr, bnd, 0x0F, 0x1B); } void bndmov(const BoundsReg& bnd, const Operand& op) { db(0x66); opModRM(bnd, op, op.isBNDREG(), op.isMEM(), 0x0F, 0x1A); } -void bndstx(const Address& addr, const BoundsReg& bnd) { opModM(addr, bnd, 0x0F, 0x1B); } +void bndstx(const Address& addr, const BoundsReg& bnd) { opMIB(addr, bnd, 0x0F, 0x1B); } void bsf(const Reg®, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); } void bsr(const Reg®, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); } void bswap(const Reg32e& reg) { opModR(Reg32(1), reg, 0x0F); } @@ -293,6 +294,9 @@ void fxch(const Fpu& reg) { opFpu(reg, 0xD9, 0xC8); } void fxtract() { db(0xD9); db(0xF4); } void fyl2x() { db(0xD9); db(0xF1); } void fyl2xp1() { db(0xD9); db(0xF9); } +void gf2p8affineinvqb(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } +void gf2p8affineqb(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xCE, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } +void gf2p8mulb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, NONE, 0x38); } void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); } void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); } void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); } @@ -758,10 +762,10 @@ void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_Z | T_N4, 0x58); } void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0xD0); } void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0xD0); } -void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_W0, 0xDE); } -void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_W0, 0xDF); } -void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_W0, 0xDC); } -void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_W0, 0xDD); } +void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDE); } +void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDF); } +void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDC); } +void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDD); } void vaesimc(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_W0, 0xDB); } void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0xDF, imm); } void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x55); } @@ -1001,6 +1005,9 @@ void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1 void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x92, 1); } void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x93, 1); } void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x93, 2); } +void vgf2p8affineinvqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCF, imm); } +void vgf2p8affineqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCE, imm); } +void vgf2p8mulb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_SAE_Z, 0xCF); } void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0x7C); } void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7C); } void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0x7D); } @@ -1099,7 +1106,7 @@ void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isME void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x58); } void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX, 0x59); } void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x79); } -void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x44, imm); } +void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM | T_EVEX, 0x44, imm); } void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x74); } void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x76); } void vpcmpeqq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x29); } @@ -1661,8 +1668,10 @@ void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } void vcmpsd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N8 | T_F2 | T_0F | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } void vcmpss(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N4 | T_F3 | T_0F | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } +void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x63); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); } +void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); } void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -1784,6 +1793,10 @@ void vpcompressd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4); } +void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x50); } +void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x51); } +void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); } +void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x53); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); } @@ -1798,8 +1811,10 @@ void vpermt2ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vpermt2q(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7E); } void vpermt2w(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7D); } void vpermw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8D); } +void vpexpandb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62); } void vpexpandd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x89); } void vpexpandq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x89); } +void vpexpandw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62); } void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x90, 0); } void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x90, 1); } void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x91, 2); } @@ -1840,6 +1855,10 @@ void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_ void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); } void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); } void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); } +void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } +void vpopcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x55); } +void vpopcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x55); } +void vpopcntw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } void vpord(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xEB); } void vporq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xEB); } void vprold(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 1), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x72, imm); } @@ -1854,6 +1873,19 @@ void vpscatterdd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | void vpscatterdq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA0, 1); } void vpscatterqd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 2); } void vpscatterqq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 0); } +void vpshldd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71, imm); } +void vpshldq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71, imm); } +void vpshldvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71); } +void vpshldvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71); } +void vpshldvw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x70); } +void vpshldw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x70, imm); } +void vpshrdd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x73, imm); } +void vpshrdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x73, imm); } +void vpshrdvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x73); } +void vpshrdvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x73); } +void vpshrdvw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x72); } +void vpshrdw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x72, imm); } +void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); } void vpsllvw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x12); } void vpsraq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm); } void vpsraq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX, 0xE2); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index eb65004c..e55d66d1 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -165,100 +165,120 @@ public: static const Type tMOVBE = uint64(1) << 34; // mobve static const Type tAVX512F = uint64(1) << 35; static const Type tAVX512DQ = uint64(1) << 36; - static const Type tAVX512IFMA = uint64(1) << 37; + static const Type tAVX512_IFMA = uint64(1) << 37; + static const Type tAVX512IFMA = tAVX512_IFMA; static const Type tAVX512PF = uint64(1) << 38; static const Type tAVX512ER = uint64(1) << 39; static const Type tAVX512CD = uint64(1) << 40; static const Type tAVX512BW = uint64(1) << 41; static const Type tAVX512VL = uint64(1) << 42; - static const Type tAVX512VBMI = uint64(1) << 43; + static const Type tAVX512_VBMI = uint64(1) << 43; + static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual static const Type tAVX512_4VNNIW = uint64(1) << 44; static const Type tAVX512_4FMAPS = uint64(1) << 45; static const Type tPREFETCHWT1 = uint64(1) << 46; static const Type tPREFETCHW = uint64(1) << 47; static const Type tSHA = uint64(1) << 48; static const Type tMPX = uint64(1) << 49; + static const Type tAVX512_VBMI2 = uint64(1) << 50; + static const Type tGFNI = uint64(1) << 51; + static const Type tVAES = uint64(1) << 52; + static const Type tVPCLMULQDQ = uint64(1) << 53; + static const Type tAVX512_VNNI = uint64(1) << 54; + static const Type tAVX512_BITALG = uint64(1) << 55; + static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56; Cpu() : type_(NONE) { unsigned int data[4]; + const unsigned int& EAX = data[0]; + const unsigned int& EBX = data[1]; + const unsigned int& ECX = data[2]; + const unsigned int& EDX = data[3]; getCpuid(0, data); - const unsigned int maxNum = data[0]; + const unsigned int maxNum = EAX; static const char intel[] = "ntel"; static const char amd[] = "cAMD"; - if (data[2] == get32bitAsBE(amd)) { + if (ECX == get32bitAsBE(amd)) { type_ |= tAMD; getCpuid(0x80000001, data); - if (data[3] & (1U << 31)) type_ |= t3DN; - if (data[3] & (1U << 15)) type_ |= tCMOV; - if (data[3] & (1U << 30)) type_ |= tE3DN; - if (data[3] & (1U << 22)) type_ |= tMMX2; - if (data[3] & (1U << 27)) type_ |= tRDTSCP; + if (EDX & (1U << 31)) type_ |= t3DN; + if (EDX & (1U << 15)) type_ |= tCMOV; + if (EDX & (1U << 30)) type_ |= tE3DN; + if (EDX & (1U << 22)) type_ |= tMMX2; + if (EDX & (1U << 27)) type_ |= tRDTSCP; } - if (data[2] == get32bitAsBE(intel)) { + if (ECX == get32bitAsBE(intel)) { type_ |= tINTEL; getCpuid(0x80000001, data); - if (data[3] & (1U << 27)) type_ |= tRDTSCP; - if (data[2] & (1U << 5)) type_ |= tLZCNT; - if (data[2] & (1U << 8)) type_ |= tPREFETCHW; + if (EDX & (1U << 27)) type_ |= tRDTSCP; + if (ECX & (1U << 5)) type_ |= tLZCNT; + if (ECX & (1U << 8)) type_ |= tPREFETCHW; } getCpuid(1, data); - if (data[2] & (1U << 0)) type_ |= tSSE3; - if (data[2] & (1U << 9)) type_ |= tSSSE3; - if (data[2] & (1U << 19)) type_ |= tSSE41; - if (data[2] & (1U << 20)) type_ |= tSSE42; - if (data[2] & (1U << 22)) type_ |= tMOVBE; - if (data[2] & (1U << 23)) type_ |= tPOPCNT; - if (data[2] & (1U << 25)) type_ |= tAESNI; - if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ; - if (data[2] & (1U << 27)) type_ |= tOSXSAVE; - if (data[2] & (1U << 30)) type_ |= tRDRAND; - if (data[2] & (1U << 29)) type_ |= tF16C; + if (ECX & (1U << 0)) type_ |= tSSE3; + if (ECX & (1U << 9)) type_ |= tSSSE3; + if (ECX & (1U << 19)) type_ |= tSSE41; + if (ECX & (1U << 20)) type_ |= tSSE42; + if (ECX & (1U << 22)) type_ |= tMOVBE; + if (ECX & (1U << 23)) type_ |= tPOPCNT; + if (ECX & (1U << 25)) type_ |= tAESNI; + if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; + if (ECX & (1U << 27)) type_ |= tOSXSAVE; + if (ECX & (1U << 30)) type_ |= tRDRAND; + if (ECX & (1U << 29)) type_ |= tF16C; - if (data[3] & (1U << 15)) type_ |= tCMOV; - if (data[3] & (1U << 23)) type_ |= tMMX; - if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE; - if (data[3] & (1U << 26)) type_ |= tSSE2; + if (EDX & (1U << 15)) type_ |= tCMOV; + if (EDX & (1U << 23)) type_ |= tMMX; + if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE; + if (EDX & (1U << 26)) type_ |= tSSE2; if (type_ & tOSXSAVE) { // check XFEATURE_ENABLED_MASK[2:1] = '11b' uint64 bv = getXfeature(); if ((bv & 6) == 6) { - if (data[2] & (1U << 28)) type_ |= tAVX; - if (data[2] & (1U << 12)) type_ |= tFMA; + if (ECX & (1U << 28)) type_ |= tAVX; + if (ECX & (1U << 12)) type_ |= tFMA; if (((bv >> 5) & 7) == 7) { getCpuidEx(7, 0, data); - if (data[1] & (1U << 16)) type_ |= tAVX512F; + if (EBX & (1U << 16)) type_ |= tAVX512F; if (type_ & tAVX512F) { - if (data[1] & (1U << 17)) type_ |= tAVX512DQ; - if (data[1] & (1U << 21)) type_ |= tAVX512IFMA; - if (data[1] & (1U << 26)) type_ |= tAVX512PF; - if (data[1] & (1U << 27)) type_ |= tAVX512ER; - if (data[1] & (1U << 28)) type_ |= tAVX512CD; - if (data[1] & (1U << 30)) type_ |= tAVX512BW; - if (data[1] & (1U << 31)) type_ |= tAVX512VL; - if (data[2] & (1U << 1)) type_ |= tAVX512VBMI; - if (data[3] & (1U << 2)) type_ |= tAVX512_4VNNIW; - if (data[3] & (1U << 3)) type_ |= tAVX512_4FMAPS; + if (EBX & (1U << 17)) type_ |= tAVX512DQ; + if (EBX & (1U << 21)) type_ |= tAVX512_IFMA; + if (EBX & (1U << 26)) type_ |= tAVX512PF; + if (EBX & (1U << 27)) type_ |= tAVX512ER; + if (EBX & (1U << 28)) type_ |= tAVX512CD; + if (EBX & (1U << 30)) type_ |= tAVX512BW; + if (EBX & (1U << 31)) type_ |= tAVX512VL; + if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; + if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; + if (ECX & (1U << 8)) type_ |= tGFNI; + if (ECX & (1U << 9)) type_ |= tVAES; + if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; + if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; + if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; + if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; + if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; + if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; } } } } if (maxNum >= 7) { getCpuidEx(7, 0, data); - if (type_ & tAVX && data[1] & 0x20) type_ |= tAVX2; - if (data[1] & (1U << 3)) type_ |= tBMI1; - if (data[1] & (1U << 8)) type_ |= tBMI2; - if (data[1] & (1U << 9)) type_ |= tENHANCED_REP; - if (data[1] & (1U << 18)) type_ |= tRDSEED; - if (data[1] & (1U << 19)) type_ |= tADX; - if (data[1] & (1U << 20)) type_ |= tSMAP; - if (data[1] & (1U << 4)) type_ |= tHLE; - if (data[1] & (1U << 11)) type_ |= tRTM; - if (data[1] & (1U << 14)) type_ |= tMPX; - if (data[1] & (1U << 29)) type_ |= tSHA; - if (data[2] & (1U << 0)) type_ |= tPREFETCHWT1; + if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; + if (EBX & (1U << 3)) type_ |= tBMI1; + if (EBX & (1U << 8)) type_ |= tBMI2; + if (EBX & (1U << 9)) type_ |= tENHANCED_REP; + if (EBX & (1U << 18)) type_ |= tRDSEED; + if (EBX & (1U << 19)) type_ |= tADX; + if (EBX & (1U << 20)) type_ |= tSMAP; + if (EBX & (1U << 4)) type_ |= tHLE; + if (EBX & (1U << 11)) type_ |= tRTM; + if (EBX & (1U << 14)) type_ |= tMPX; + if (EBX & (1U << 29)) type_ |= tSHA; + if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); }