From dbb1f8cf373ab2110548985332a9ebf7e71346a9 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 22 Apr 2020 20:51:10 +0100 Subject: [PATCH] Squashed 'externals/xbyak/' changes from 2794cde7..671fc805 671fc805 update test/cybozu 8ca86231 remove mutable in Address 8b93498f add cmpsb/scasb/... 7eb62750 avoid core_sharing_data_cache = 0 for some cloud envrionment 85767e95 support mingw64 59573e6e add PROTECT_RE mode for protect() 71b75f65 fix push(qword[mem]) 811f4959 Merge branch 'rsdubtso-master' 8e3cb711 Account for potentially zero 0xb leaf when parsing cache/topology via cpuid a816249f update version fe083912 fix to avoid zero division for some virtual machine f0a8f7fa update version cac09b7a Merge pull request #62 from mgouicem/master 1f96b5e0 Fixes an error raised by clang < 3.9 c0f885ac Merge pull request #61 from mgouicem/master bfe2d201 Change default value for n_cores in setCacheHierarchy. fd587b55 change format and add getter for data_cache_size 80b3c7b9 remove macro 88189609 Merge branch 'mgouicem-master' e6b79723 Adding queries to get the cpu topology on Intel architectures. 221384f0 vmov* supports [mem]|k|z c04141ef define XBYAK_NO_OP_NAMES for test af7f05ee add const for Label git-subtree-dir: externals/xbyak git-subtree-split: 671fc805d09d075f48d4625f183ef2e1ef725106 --- gen/avx_type.hpp | 5 ++ gen/gen_avx512.cpp | 65 ++++++++++++++--------- gen/gen_code.cpp | 22 ++++++-- readme.md | 14 +++-- readme.txt | 19 ++++--- sample/static_buf.cpp | 6 ++- sample/test0.cpp | 9 ++-- sample/test_util.cpp | 3 ++ test/Makefile | 1 + test/bad_address.cpp | 2 + test/cybozu/COPYRIGHT | 27 ---------- test/cybozu/inttype.hpp | 76 +++++++++++++++++++++------ test/cybozu/test.hpp | 2 +- test/jmp.cpp | 3 +- test/make_512.cpp | 6 +-- test/make_nm.cpp | 58 +++++++++++++++----- test/nm_frame.cpp | 1 + test/test_address.bat | 2 +- test/test_avx.bat | 2 +- test/test_avx512.bat | 2 +- test/test_nm.bat | 2 +- test/test_nm.sh | 6 +++ xbyak/xbyak.h | 99 ++++++++++++++++++++-------------- xbyak/xbyak_mnemonic.h | 114 ++++++++++++++++++++++------------------ xbyak/xbyak_util.h | 83 +++++++++++++++++++++++++++++ 25 files changed, 425 insertions(+), 204 deletions(-) delete mode 100644 test/cybozu/COPYRIGHT diff --git a/gen/avx_type.hpp b/gen/avx_type.hpp index 244a2c04..6f51166f 100644 --- a/gen/avx_type.hpp +++ b/gen/avx_type.hpp @@ -36,6 +36,7 @@ T_B32 = 1 << 26, // m32bcst T_B64 = 1 << 27, // m64bcst T_M_K = 1 << 28, // mem{k} + T_VSIB = 1 << 29, T_XXX }; @@ -156,5 +157,9 @@ std::string type2String(int type) if (!str.empty()) str += " | "; str += "T_M_K"; } + if (type & T_VSIB) { + if (!str.empty()) str += " | "; + str += "T_VSIB"; + } return str; } diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 5e0591ec..29b95c14 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -202,12 +202,12 @@ void putM_X() const char *name; int type; } tbl[] = { - { 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu8", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu16", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, + { 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu8", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu16", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -533,7 +533,7 @@ void putGather() }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; - std::string type = type2String(p.type); + std::string type = type2String(p.type | T_VSIB); printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode); } } @@ -557,7 +557,7 @@ void putScatter() }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; - std::string type = type2String(p.type); + std::string type = type2String(p.type | T_VSIB); printf("void %s(const Address& addr, const Xmm& x) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode); } } @@ -669,26 +669,41 @@ void putMisc() { puts("void vpbroadcastmb2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, 0x2A); }"); puts("void vpbroadcastmw2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, 0x3A); }"); + { + const struct Tbl { + const char *name; + int zm; + int type; + uint8 code; + bool isZmm; + } tbl[] = { + { "vgatherpf0dps", 1, T_EW0 | T_N4, 0xC6, true }, + { "vgatherpf0qps", 1, T_EW0 | T_N4, 0xC7, true }, + { "vgatherpf0dpd", 1, T_EW1 | T_N8, 0xC6, false }, + { "vgatherpf0qpd", 1, T_EW1 | T_N8, 0xC7, true }, - puts("void vgatherpf0dps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); }"); - puts("void vgatherpf0qps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); }"); - puts("void vgatherpf0dpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); }"); - puts("void vgatherpf0qpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); }"); + { "vgatherpf1dps", 2, T_EW0 | T_N4, 0xC6, true }, + { "vgatherpf1qps", 2, T_EW0 | T_N4, 0xC7, true }, + { "vgatherpf1dpd", 2, T_EW1 | T_N8, 0xC6, false }, + { "vgatherpf1qpd", 2, T_EW1 | T_N8, 0xC7, true }, - puts("void vgatherpf1dps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); }"); - puts("void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); }"); - puts("void vgatherpf1dpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); }"); - puts("void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); }"); + { "vscatterpf0dps", 5, T_EW0 | T_N4, 0xC6, true }, + { "vscatterpf0qps", 5, T_EW0 | T_N4, 0xC7, true }, + { "vscatterpf0dpd", 5, T_EW1 | T_N8, 0xC6, false }, + { "vscatterpf0qpd", 5, T_EW1 | T_N8, 0xC7, true }, - puts("void vscatterpf0dps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); }"); - puts("void vscatterpf0qps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); }"); - puts("void vscatterpf0dpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); }"); - puts("void vscatterpf0qpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); }"); - - puts("void vscatterpf1dps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); }"); - puts("void vscatterpf1qps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); }"); - puts("void vscatterpf1dpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); }"); - puts("void vscatterpf1qpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); }"); + { "vscatterpf1dps", 6, T_EW0 | T_N4, 0xC6, true }, + { "vscatterpf1qps", 6, T_EW0 | T_N4, 0xC7, true }, + { "vscatterpf1dpd", 6, T_EW1 | T_N8, 0xC6, false }, + { "vscatterpf1qpd", 6, T_EW1 | T_N8, 0xC7, true }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + std::string type = type2String(p.type | T_66 | T_0F38 | T_MUST_EVEX | T_M_K | T_VSIB); + printf("void %s(const Address& addr) { opGatherFetch(addr, zm%d, %s, 0x%2X, Operand::%s); }\n" + , p.name, p.zm, type.c_str(), p.code, p.isZmm ? "ZMM" : "YMM"); + } + } puts("void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }"); puts("void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); }"); diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index fe0b59ac..37877bfe 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -630,9 +630,18 @@ void put() { "cpuid", 0x0F, 0xA2 }, { "cwd", 0x66, 0x99 }, { "cwde", 0x98 }, + { "cmpsb", 0xA6 }, + { "cmpsw", 0x66, 0xA7 }, + { "cmpsd", 0xA7 }, + { "scasb", 0xAE }, + { "scasw", 0x66, 0xAF }, + { "scasd", 0xAF }, { "movsb", 0xA4 }, { "movsw", 0x66, 0xA5 }, { "movsd", 0xA5 }, + { "stosb", 0xAA }, + { "stosw", 0x66, 0xAB }, + { "stosd", 0xAB }, { "rep", 0xF3 }, { "lahf", 0x9F }, @@ -1233,12 +1242,12 @@ void put() const char *name; int type; } tbl[] = { - { 0x29, "movapd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 }, - { 0x29, "movaps", T_0F | T_YMM | T_EVEX | T_EW0 }, + { 0x29, "movapd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_M_K }, + { 0x29, "movaps", T_0F | T_YMM | T_EVEX | T_EW0 | T_M_K }, { 0x7F, "movdqa", T_0F | T_66 | T_YMM }, { 0x7F, "movdqu", T_0F | T_F3 | T_YMM }, - { 0x11, "movupd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 }, - { 0x11, "movups", T_0F | T_YMM | T_EVEX | T_EW0 }, + { 0x11, "movupd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_M_K }, + { 0x11, "movups", T_0F | T_YMM | T_EVEX | T_EW0 | T_M_K }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1643,7 +1652,7 @@ void put() }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; - printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W%d, 0x%x, %d); }\n", p.name, p.w, p.code, p.mode); + printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W%d, 0x%x, %d); }\n", p.name, p.w, p.code, p.mode); } } } @@ -1678,7 +1687,10 @@ void put64() const GenericTbl tbl[] = { { "cdqe", 0x48, 0x98 }, { "cqo", 0x48, 0x99 }, + { "cmpsq", 0x48, 0xA7 }, { "movsq", 0x48, 0xA5 }, + { "scasq", 0x48, 0xAF }, + { "stosq", 0x48, 0xAB }, }; putGeneric(tbl, NUM_OF_ARRAY(tbl)); diff --git a/readme.md b/readme.md index ec1908d5..2c41e822 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -Xbyak 5.601 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ +Xbyak 5.67 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ ============= Abstract @@ -327,12 +327,16 @@ License modified new BSD License http://opensource.org/licenses/BSD-3-Clause -The files under test/cybozu/ are copied from cybozulib(https://github.com/herumi/cybozulib/), -which is licensed by BSD-3-Clause and are used for only tests. -The header files under xbyak/ are independent of cybozulib. - History ------------- +* 2018/Aug/14 ver 5.67 remove mutable in Address ; fix setCacheHierarchy for cloud vm +* 2018/Jul/26 ver 5.661 support mingw64 +* 2018/Jul/24 ver 5.66 add CodeArray::PROTECT_RE to mode of protect() +* 2018/Jun/26 ver 5.65 fix push(qword [mem]) +* 2018/Mar/07 ver 5.64 fix zero division in Cpu() on some cpu +* 2018/Feb/14 ver 5.63 fix Cpu::setCacheHierarchy() and fix EvexModifierZero for clang<3.9(thanks to mgouicem) +* 2018/Feb/13 ver 5.62 Cpu::setCacheHierarchy() by mgouicem and rsdubtso +* 2018/Feb/07 ver 5.61 vmov* supports mem{k}{z}(I forgot it) * 2018/Jan/24 ver 5.601 add xword, yword, etc. into Xbyak::util namespace * 2018/Jan/05 ver 5.60 support AVX-512 for Ice lake(319433-030.pdf) * 2017/Aug/22 ver 5.53 fix mpx encoding, add bnd() prefix diff --git a/readme.txt b/readme.txt index aa99b85b..74eb5912 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.601 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.67 ----------------------------------------------------------------------------- ◎概要 @@ -335,14 +335,17 @@ http://opensource.org/licenses/BSD-3-Clause sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から いただきました。 -test/cybozu/以下のファイルはcybozulib(https://github.com/herumi/cybozulib/) -の一部を使っています。cybozulibはBSD-3-Clauseライセンスです。 -cybozulibは単体テストでのみ利用されていて、xbyak/ディレクトリ以下のヘッダ -ファイルはcybozulibとは独立に利用できます。 - ----------------------------------------------------------------------------- ◎履歴 +2018/08/14 ver 5.67 Addressクラス内のmutableを削除 ; fix setCacheHierarchy for cloud vm +2018/07/26 ver 5.661 mingw64対応 +2018/07/24 ver 5.66 protect()のmodeにCodeArray::PROTECT_REを追加 +2018/06/26 ver 5.65 fix push(qword [mem]) +2018/03/07 ver 5.64 Cpu()の中でzero divisionが出ることがあるのを修正 +2018/02/14 ver 5.63 Cpu::setCacheHierarchy()の修正とclang<3.9のためのEvexModifierZero修正(thanks to mgouicem) +2018/02/13 ver 5.62 Cpu::setCacheHierarchy() by mgouicem and rsdubtso +2018/02/07 ver 5.61 vmov*がmem{k}{z}形式対応(忘れてた) 2018/01/24 ver 5.601 xword, ywordなどをXbyak::util名前空間に追加 2018/01/05 ver 5.60 Ice lake系命令対応(319433-030.pdf) 2017/08/22 ver 5.53 mpxエンコーディングバグ修正, bnd()プレフィクス追加 @@ -470,7 +473,3 @@ cybozulibは単体テストでのみ利用されていて、xbyak/ディレク ◎著作権者 光成滋生(MITSUNARI Shigeo, herumi@nifty.com) - ---- -$Revision: 1.56 $ -$Date: 2010/04/16 11:58:22 $ diff --git a/sample/static_buf.cpp b/sample/static_buf.cpp index a2ef0e27..7cf8038d 100644 --- a/sample/static_buf.cpp +++ b/sample/static_buf.cpp @@ -13,7 +13,6 @@ struct Code : Xbyak::CodeGenerator { { puts("generate"); printf("ptr=%p, %p\n", getCode(), buf); - Xbyak::CodeArray::protect(buf, sizeof(buf), true); #ifdef XBYAK32 mov(eax, ptr [esp + 4]); add(eax, ptr [esp + 8]); @@ -23,6 +22,11 @@ struct Code : Xbyak::CodeGenerator { lea(rax, ptr [rdi + rsi]); #endif ret(); + Xbyak::CodeArray::protect(buf, sizeof(buf), Xbyak::CodeArray::PROTECT_RE); + } + ~Code() + { + Xbyak::CodeArray::protect(buf, sizeof(buf), Xbyak::CodeArray::PROTECT_RW); } } s_code; diff --git a/sample/test0.cpp b/sample/test0.cpp index e4370929..cd19e484 100644 --- a/sample/test0.cpp +++ b/sample/test0.cpp @@ -162,18 +162,21 @@ int main() { // use memory allocated by user using namespace Xbyak; - const size_t codeSize = 1024; + const size_t codeSize = 4096; uint8 buf[codeSize + 16]; uint8 *p = CodeArray::getAlignedAddress(buf); - CodeArray::protect(p, codeSize, true); Sample s(p, codeSize); + if (!CodeArray::protect(p, codeSize, CodeArray::PROTECT_RWE)) { + fprintf(stderr, "can't protect\n"); + return 1; + } int (*func)(int) = s.getCode(); if (Xbyak::CastTo(func) != p) { fprintf(stderr, "internal error %p %p\n", p, Xbyak::CastTo(func)); return 1; } printf("0 + ... + %d = %d\n", 100, func(100)); - CodeArray::protect(p, codeSize, false); + CodeArray::protect(p, codeSize, CodeArray::PROTECT_RW); } puts("OK"); testReset(); diff --git a/sample/test_util.cpp b/sample/test_util.cpp index bb515db9..9b199353 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -104,6 +104,9 @@ void putCPUinfo() Core i7-3930K 6 2D */ cpu.putFamily(); + for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { + printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); + } } int main() diff --git a/test/Makefile b/test/Makefile index e07e1bf1..3180f18d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -37,6 +37,7 @@ test: normalize_prefix jmp bad_address $(TARGET) $(MAKE) -C ../gen ./test_nm.sh ./test_nm.sh Y + ./test_nm.sh avx512 ./test_address.sh ./jmp ./bad_address diff --git a/test/bad_address.cpp b/test/bad_address.cpp index 3cac3fa3..a74dd993 100644 --- a/test/bad_address.cpp +++ b/test/bad_address.cpp @@ -27,6 +27,8 @@ struct Code : Xbyak::CodeGenerator { TEST_EXCEPTION(mov(eax, ptr [eax + eax + eax])); TEST_EXCEPTION(mov(eax, ptr [eax * 2 + ecx * 4])); TEST_EXCEPTION(mov(eax, ptr [eax * 2 + ecx * 4])); + TEST_EXCEPTION(mov(eax, ptr [xmm0])); + TEST_EXCEPTION(fld(dword [xmm0])); TEST_EXCEPTION(vgatherdpd(xmm0, ptr [eax * 2], ymm3)); TEST_EXCEPTION(vgatherdpd(xmm0, ptr [xmm0 + xmm1], ymm3)); #ifdef XBYAK64 diff --git a/test/cybozu/COPYRIGHT b/test/cybozu/COPYRIGHT deleted file mode 100644 index a91037bb..00000000 --- a/test/cybozu/COPYRIGHT +++ /dev/null @@ -1,27 +0,0 @@ - -Copyright (c) 2007-2012 Cybozu Labs, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. -Neither the name of the Cybozu Labs, Inc. nor the names of its contributors may -be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -THE POSSIBILITY OF SUCH DAMAGE. diff --git a/test/cybozu/inttype.hpp b/test/cybozu/inttype.hpp index af326622..62856bdb 100644 --- a/test/cybozu/inttype.hpp +++ b/test/cybozu/inttype.hpp @@ -2,10 +2,11 @@ /** @file @brief int type definition and macros - Copyright (C) 2008 Cybozu Labs, Inc., all rights reserved. + @author MITSUNARI Shigeo(@herumi) */ -#if defined(_MSC_VER) && (MSC_VER <= 1500) +#if defined(_MSC_VER) && (MSC_VER <= 1500) && !defined(CYBOZU_DEFINED_INTXX) + #define CYBOZU_DEFINED_INTXX typedef __int64 int64_t; typedef unsigned __int64 uint64_t; typedef unsigned int uint32_t; @@ -38,27 +39,33 @@ #define CYBOZU_ALIGN(x) __attribute__((aligned(x))) #endif #endif +#ifndef CYBOZU_FORCE_INLINE + #ifdef _MSC_VER + #define CYBOZU_FORCE_INLINE __forceinline + #else + #define CYBOZU_FORCE_INLINE __attribute__((always_inline)) + #endif +#endif +#ifndef CYBOZU_UNUSED + #ifdef __GNUC__ + #define CYBOZU_UNUSED __attribute__((unused)) + #else + #define CYBOZU_UNUSED + #endif +#endif #ifndef CYBOZU_ALLOCA #ifdef _MSC_VER #include #define CYBOZU_ALLOCA(x) _malloca(x) #else - #define CYBOZU_ALLOCA_(x) __builtin_alloca(x) - #endif -#endif -#ifndef CYBOZU_FOREACH - // std::vector v; CYBOZU_FOREACH(auto x, v) {...} - #if defined(_MSC_VER) && (_MSC_VER >= 1400) - #define CYBOZU_FOREACH(type_x, xs) for each (type_x in xs) - #elif defined(__GNUC__) - #define CYBOZU_FOREACH(type_x, xs) for (type_x : xs) + #define CYBOZU_ALLOCA(x) __builtin_alloca(x) #endif #endif #ifndef CYBOZU_NUM_OF_ARRAY #define CYBOZU_NUM_OF_ARRAY(x) (sizeof(x) / sizeof(*x)) #endif #ifndef CYBOZU_SNPRINTF - #ifdef _MSC_VER + #if defined(_MSC_VER) && (_MSC_VER < 1900) #define CYBOZU_SNPRINTF(x, len, ...) (void)_snprintf_s(x, len, len - 1, __VA_ARGS__) #else #define CYBOZU_SNPRINTF(x, len, ...) (void)snprintf(x, len, __VA_ARGS__) @@ -68,20 +75,36 @@ #define CYBOZU_CPP_VERSION_CPP03 0 #define CYBOZU_CPP_VERSION_TR1 1 #define CYBOZU_CPP_VERSION_CPP11 2 +#define CYBOZU_CPP_VERSION_CPP14 3 +#define CYBOZU_CPP_VERSION_CPP17 4 -#if (__cplusplus >= 201103) || (_MSC_VER >= 1500) || defined(__GXX_EXPERIMENTAL_CXX0X__) +#ifdef __GNUC__ + #define CYBOZU_GNUC_PREREQ(major, minor) ((__GNUC__) * 100 + (__GNUC_MINOR__) >= (major) * 100 + (minor)) +#else + #define CYBOZU_GNUC_PREREQ(major, minor) 0 +#endif + +#if (__cplusplus >= 201703) + #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_CPP17 +#elif (__cplusplus >= 201402) + #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_CPP14 +#elif (__cplusplus >= 201103) || (_MSC_VER >= 1500) || defined(__GXX_EXPERIMENTAL_CXX0X__) #if defined(_MSC_VER) && (_MSC_VER <= 1600) #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_TR1 #else #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_CPP11 #endif -#elif (__GNUC__ >= 4 && __GNUC_MINOR__ >= 5) || (__clang_major__ >= 3) +#elif CYBOZU_GNUC_PREREQ(4, 5) || (CYBOZU_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || (__clang_major__ >= 3) #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_TR1 #else #define CYBOZU_CPP_VERSION CYBOZU_CPP_VERSION_CPP03 #endif -#if (CYBOZU_CPP_VERSION == CYBOZU_CPP_VERSION_TR1) +#ifdef CYBOZU_USE_BOOST + #define CYBOZU_NAMESPACE_STD boost + #define CYBOZU_NAMESPACE_TR1_BEGIN + #define CYBOZU_NAMESPACE_TR1_END +#elif (CYBOZU_CPP_VERSION == CYBOZU_CPP_VERSION_TR1) && !defined(__APPLE__) #define CYBOZU_NAMESPACE_STD std::tr1 #define CYBOZU_NAMESPACE_TR1_BEGIN namespace tr1 { #define CYBOZU_NAMESPACE_TR1_END } @@ -92,25 +115,44 @@ #endif #ifndef CYBOZU_OS_BIT - #if defined(_WIN64) || defined(__x86_64__) + #if defined(_WIN64) || defined(__x86_64__) || defined(__AARCH64EL__) || defined(__EMSCRIPTEN__) #define CYBOZU_OS_BIT 64 #else #define CYBOZU_OS_BIT 32 #endif #endif +#ifndef CYBOZU_HOST + #define CYBOZU_HOST_UNKNOWN 0 + #define CYBOZU_HOST_INTEL 1 + #define CYBOZU_HOST_ARM 2 + #if defined(_M_IX86) || defined(_M_AMD64) || defined(__x86_64__) || defined(__i386__) + #define CYBOZU_HOST CYBOZU_HOST_INTEL + #elif defined(__arm__) || defined(__AARCH64EL__) + #define CYBOZU_HOST CYBOZU_HOST_ARM + #else + #define CYBOZU_HOST CYBOZU_HOST_UNKNOWN + #endif +#endif #ifndef CYBOZU_ENDIAN #define CYBOZU_ENDIAN_UNKNOWN 0 #define CYBOZU_ENDIAN_LITTLE 1 #define CYBOZU_ENDIAN_BIG 2 - #if defined(_M_IX86) || defined(_M_AMD64) || defined(__x86_64__) || defined(__i386__) + #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) + #define CYBOZU_ENDIAN CYBOZU_ENDIAN_LITTLE + #elif (CYBOZU_HOST == CYBOZU_HOST_ARM) && (defined(__ARM_EABI__) || defined(__AARCH64EL__)) #define CYBOZU_ENDIAN CYBOZU_ENDIAN_LITTLE #else #define CYBOZU_ENDIAN CYBOZU_ENDIAN_UNKNOWN #endif #endif +#if CYBOZU_CPP_VERSION >= CYBOZU_CPP_VERSION_CPP11 + #define CYBOZU_NOEXCEPT noexcept +#else + #define CYBOZU_NOEXCEPT throw() +#endif namespace cybozu { template void disable_warning_unused_variable(const T&) { } diff --git a/test/cybozu/test.hpp b/test/cybozu/test.hpp index fa735d21..7dfffab9 100644 --- a/test/cybozu/test.hpp +++ b/test/cybozu/test.hpp @@ -3,7 +3,7 @@ @file @brief unit test class - Copyright (C) 2008 Cybozu Labs, Inc., all rights reserved. + @author MITSUNARI Shigeo(@herumi) */ #include diff --git a/test/jmp.cpp b/test/jmp.cpp index 79d54904..2578adbb 100644 --- a/test/jmp.cpp +++ b/test/jmp.cpp @@ -1143,10 +1143,11 @@ CYBOZU_TEST_AUTO(rip_addr_with_fixed_buf) ret(); } } code; - Xbyak::CodeArray::protect(p, 4096, true); + Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RE); code.getCode()(); CYBOZU_TEST_EQUAL(*x0, 123); CYBOZU_TEST_EQUAL(*x1, 456); CYBOZU_TEST_EQUAL(buf[8], 99); + Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RW); } #endif diff --git a/test/make_512.cpp b/test/make_512.cpp index 4efd69f6..408f98b7 100644 --- a/test/make_512.cpp +++ b/test/make_512.cpp @@ -840,9 +840,9 @@ public: put(p.name, _YMM|YMM_KZ, _YMM|MEM); put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM); if (!p.M_X) continue; - put(p.name, MEM, _XMM); - put(p.name, MEM, _YMM); - put(p.name, MEM, _ZMM); + put(p.name, MEM|MEM_K, _XMM); + put(p.name, MEM|MEM_K, _YMM); + put(p.name, MEM|MEM_K, _ZMM); } put("vsqrtpd", XMM_KZ, M_1to2 | _MEM); put("vsqrtpd", YMM_KZ, M_1to4 | _MEM); diff --git a/test/make_nm.cpp b/test/make_nm.cpp index 72c82472..f109f1ec 100644 --- a/test/make_nm.cpp +++ b/test/make_nm.cpp @@ -1,4 +1,5 @@ #include +#define XBYAK_NO_OP_NAMES #include "xbyak/xbyak.h" #include "xbyak/xbyak_bin2hex.h" #include @@ -121,6 +122,15 @@ class Test { void operator=(const Test&); const bool isXbyak_; int funcNum_; + /* + and_, or_, xor_, not_ => and, or, xor, not + */ + std::string removeUnderScore(std::string s) const + { + if (!isXbyak_ && s[s.size() - 1] == '_') s.resize(s.size() - 1); + return s; + } + // check all op1, op2, op3 void put(const std::string& nm, uint64 op1 = NOPARA, uint64 op2 = NOPARA, uint64 op3 = NOPARA, uint64 op4 = NOPARA) const { @@ -448,6 +458,10 @@ class Test { #ifdef XBYAK64 "cdqe", "cqo", + "cmpsq", + "movsq", + "scasq", + "stosq", #else "aaa", "aad", @@ -476,6 +490,18 @@ class Test { "lahf", // "lock", + "cmpsb", + "cmpsw", + "cmpsd", + "movsb", + "movsw", + "movsd", + "scasb", + "scasw", + "scasd", + "stosb", + "stosw", + "stosd", "nop", "sahf", @@ -951,15 +977,16 @@ class Test { static const char tbl[][16] = { "adc", "add", - "and", + "and_", "cmp", - "or", + "or_", "sbb", "sub", - "xor", + "xor_", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const char *p = tbl[i]; + const std::string s = removeUnderScore(tbl[i]); + const char *p = s.c_str(); put(p, REG32, REG32|MEM); put(p, REG64, REG64|MEM); put(p, REG16, REG16|MEM); @@ -1017,10 +1044,11 @@ class Test { "imul", "mul", "neg", - "not", + "not_", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const char *p = tbl[i]; + const std::string s = removeUnderScore(tbl[i]); + const char *p = s.c_str(); put(p, REG32e|REG16|REG8|REG8_3); put(p, MEM32|MEM16|MEM8); } @@ -1042,15 +1070,19 @@ class Test { push word 2 reduce 2-byte stack, so I can't support it */ - const char *p = "push"; - put(p, REG16); - put(p, IMM8); // IMM16 decrease -2 from esp - put(p, MEM16); + put("push", IMM8|IMM32); + if (isXbyak_) { + puts("push(word, 1000);dump();"); + } else { + puts("push word 1000"); + } + + put("push", REG16|MEM16); put("pop", REG16|MEM16); #ifdef XBYAK64 - put("push", REG64); - put("pop", REG64); + put("push", REG64|IMM32|MEM64); + put("pop", REG64|MEM64); #else put("push", REG32|IMM32|MEM32); put("pop", REG32|MEM32); @@ -2672,7 +2704,7 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const char *name = tbl[i]; - put(name, MEM, ZMM); + put(name, MEM|MEM_K, ZMM|XMM|YMM); put(name, ZMM, MEM); } } diff --git a/test/nm_frame.cpp b/test/nm_frame.cpp index 697c2c4a..9deceba2 100644 --- a/test/nm_frame.cpp +++ b/test/nm_frame.cpp @@ -1,4 +1,5 @@ #include +#define XBYAK_NO_OP_NAMES #define XBYAK_ENABLE_OMITTED_OPERAND #include "xbyak/xbyak.h" diff --git a/test/test_address.bat b/test/test_address.bat index f96542f1..f82c1256 100644 --- a/test/test_address.bat +++ b/test/test_address.bat @@ -31,7 +31,7 @@ address %1% jit > nm.cpp echo cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% nm_frame > x.lst -diff x.lst ok.lst +diff -w x.lst ok.lst wc x.lst :end diff --git a/test/test_avx.bat b/test/test_avx.bat index 5e51aa20..e40de157 100644 --- a/test/test_avx.bat +++ b/test/test_avx.bat @@ -38,5 +38,5 @@ make_nm jit > nm.cpp echo cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% nm_frame |%FILTER% > x.lst -diff x.lst ok.lst +diff -w x.lst ok.lst wc x.lst diff --git a/test/test_avx512.bat b/test/test_avx512.bat index a49d562c..52341460 100644 --- a/test/test_avx512.bat +++ b/test/test_avx512.bat @@ -27,5 +27,5 @@ awk "{if (index($3, ""-"")) { conti=substr($3, 0, length($3) - 1) } else { conti make_512 jit > nm.cpp cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% /DXBYAK_AVX512 nm_frame |%FILTER% > x.lst -diff x.lst ok.lst +diff -w x.lst ok.lst wc x.lst diff --git a/test/test_nm.bat b/test/test_nm.bat index 32abfac7..0d63b650 100644 --- a/test/test_nm.bat +++ b/test/test_nm.bat @@ -39,5 +39,5 @@ if /i "%Y%"=="1" ( make_nm jit > nm.cpp cl -I../ -DXBYAK_TEST nm_frame.cpp %OPT% %OPT2% nm_frame |%FILTER% > x.lst -diff x.lst ok.lst +diff -w x.lst ok.lst wc x.lst diff --git a/test/test_nm.sh b/test/test_nm.sh index 412dbf45..6001ace9 100755 --- a/test/test_nm.sh +++ b/test/test_nm.sh @@ -19,6 +19,12 @@ else if ($1 == "Y64") then set OPT2="-DUSE_YASM -DXBYAK64" set OPT3=win64 set FILTER=./normalize_prefix +else if ($1 == "avx512") then + echo "nasm(64bit) + avx512" + set EXE=nasm + set OPT2="-DXBYAK64 -DUSE_AVX512" + set OPT3=win64 + set FILTER=./normalize_prefix else echo "nasm(32bit)" set EXE=nasm diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 6ab93a09..87d8519a 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -105,7 +105,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5601 /* 0xABCD = A.BC(D) */ + VERSION = 0x5670 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -566,7 +566,7 @@ struct EvexModifierRounding { explicit EvexModifierRounding(int rounding) : rounding(rounding) {} int rounding; }; -struct EvexModifierZero{}; +struct EvexModifierZero{EvexModifierZero() {}}; struct Xmm : public Mmx { explicit Xmm(int idx = 0, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { } @@ -614,16 +614,16 @@ struct Reg64 : public Reg32e { }; struct RegRip { sint64 disp_; - Label* label_; + const Label* label_; bool isAddr_; - explicit RegRip(sint64 disp = 0, Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} + explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} friend const RegRip operator+(const RegRip& r, sint64 disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); } friend const RegRip operator-(const RegRip& r, sint64 disp) { return RegRip(r.disp_ - disp, r.label_, r.isAddr_); } - friend const RegRip operator+(const RegRip& r, Label& label) { + friend const RegRip operator+(const RegRip& r, const Label& label) { if (r.label_ || r.isAddr_) throw Error(ERR_BAD_ADDRESSING); return RegRip(r.disp_, &label); } @@ -848,10 +848,15 @@ protected: uint64 disp = i->getVal(top_); rewrite(i->codeOffset, disp, i->jmpSize); } - if (alloc_->useProtect() && !protect(top_, size_, true)) throw Error(ERR_CANT_PROTECT); + if (alloc_->useProtect() && !protect(top_, size_, PROTECT_RWE)) throw Error(ERR_CANT_PROTECT); isCalledCalcJmpAddress_ = true; } public: + enum ProtectMode { + PROTECT_RW = 0, // read/write + PROTECT_RWE = 1, // read/write/exec + PROTECT_RE = 2 // read/exec + }; explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) : type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF) , alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_) @@ -861,7 +866,7 @@ public: , isCalledCalcJmpAddress_(false) { if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC); - if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, true)) { + if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, PROTECT_RWE)) { alloc_->free(top_); throw Error(ERR_CANT_PROTECT); } @@ -869,7 +874,7 @@ public: virtual ~CodeArray() { if (isAllocType()) { - if (alloc_->useProtect()) protect(top_, maxSize_, false); + if (alloc_->useProtect()) protect(top_, maxSize_, PROTECT_RW); alloc_->free(top_); } } @@ -960,19 +965,36 @@ public: change exec permission of memory @param addr [in] buffer address @param size [in] buffer size - @param canExec [in] true(enable to exec), false(disable to exec) + @param protectMode [in] mode(RW/RWE/RE) @return true(success), false(failure) */ - static inline bool protect(const void *addr, size_t size, bool canExec) + static inline bool protect(const void *addr, size_t size, int protectMode) { +#if defined(_WIN32) + const DWORD c_rw = PAGE_READWRITE; + const DWORD c_rwe = PAGE_EXECUTE_READWRITE; + const DWORD c_re = PAGE_EXECUTE_READ; + DWORD mode; +#else + const int c_rw = PROT_READ | PROT_WRITE; + const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC; + const int c_re = PROT_READ | PROT_EXEC; + int mode; +#endif + switch (protectMode) { + case PROTECT_RW: mode = c_rw; break; + case PROTECT_RWE: mode = c_rwe; break; + case PROTECT_RE: mode = c_re; break; + default: + return false; + } #if defined(_WIN32) DWORD oldProtect; - return VirtualProtect(const_cast(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0; + return VirtualProtect(const_cast(addr), size, mode, &oldProtect) != 0; #elif defined(__GNUC__) size_t pageSize = sysconf(_SC_PAGESIZE); size_t iaddr = reinterpret_cast(addr); size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); - int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0); return mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode) == 0; #else return true; @@ -999,46 +1021,43 @@ public: M_ripAddr }; Address(uint32 sizeBit, bool broadcast, const RegExp& e) - : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), permitVsib_(false), broadcast_(broadcast) + : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), broadcast_(broadcast) { e_.verify(); } #ifdef XBYAK64 explicit Address(size_t disp) - : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), permitVsib_(false), broadcast_(false){ } + : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), broadcast_(false){ } Address(uint32 sizeBit, bool broadcast, const RegRip& addr) - : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), permitVsib_(false), broadcast_(broadcast) { } + : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), broadcast_(broadcast) { } #endif - void permitVsib() const { permitVsib_ = true; } RegExp getRegExp(bool optimize = true) const { return optimize ? e_.optimize() : e_; } Mode getMode() const { return mode_; } - bool is32bit() const { verify(); return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } - bool isOnlyDisp() const { verify(); return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax - size_t getDisp() const { verify(); return e_.getDisp(); } + bool is32bit() const { return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } + bool isOnlyDisp() const { return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax + size_t getDisp() const { return e_.getDisp(); } uint8 getRex() const { - verify(); if (mode_ != M_ModRM) return 0; return getRegExp().getRex(); } - bool is64bitDisp() const { verify(); return mode_ == M_64bitDisp; } // for moffset + bool is64bitDisp() const { return mode_ == M_64bitDisp; } // for moffset bool isBroadcast() const { return broadcast_; } const Label* getLabel() const { return label_; } bool operator==(const Address& rhs) const { - return getBit() == rhs.getBit() && e_ == rhs.e_ && label_ == rhs.label_ && mode_ == rhs.mode_ && permitVsib_ == rhs.permitVsib_ && broadcast_ == rhs.broadcast_; + return getBit() == rhs.getBit() && e_ == rhs.e_ && label_ == rhs.label_ && mode_ == rhs.mode_ && broadcast_ == rhs.broadcast_; } bool operator!=(const Address& rhs) const { return !operator==(rhs); } + bool isVsib() const { return e_.isVsib(); } private: RegExp e_; const Label* label_; Mode mode_; - mutable bool permitVsib_; bool broadcast_; - void verify() const { if (e_.isVsib() && !permitVsib_) throw Error(ERR_BAD_VSIB_ADDRESSING); } }; inline const Address& Operand::getAddress() const @@ -1443,6 +1462,7 @@ private: T_B32 = 1 << 26, // m32bcst T_B64 = 1 << 27, // m64bcst T_M_K = 1 << 28, // mem{k} + T_VSIB = 1 << 29, T_XXX }; void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false) @@ -1669,8 +1689,9 @@ private: // reg is reg field of ModRM // immSize is the size for immediate value // disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement - void opAddr(const Address &addr, int reg, int immSize = 0, int disp8N = 0) + void opAddr(const Address &addr, int reg, int immSize = 0, int disp8N = 0, bool permitVisb = false) { + if (!permitVisb && addr.isVsib()) throw Error(ERR_BAD_VSIB_ADDRESSING); if (addr.getMode() == Address::M_ModRM) { setSIB(addr.getRegExp(), reg, disp8N); } else if (addr.getMode() == Address::M_rip || addr.getMode() == Address::M_ripAddr) { @@ -1812,15 +1833,20 @@ private: } void opPushPop(const Operand& op, int code, int ext, int alt) { - if (op.isREG()) { - if (op.isBit(16)) db(0x66); - if (op.getReg().getIdx() >= 8) db(0x41); - db(alt | (op.getIdx() & 7)); - } else if (op.isMEM()) { - opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code); - } else { - throw Error(ERR_BAD_COMBINATION); + int bit = op.getBit(); + if (bit == 16 || bit == BIT) { + if (bit == 16) db(0x66); + if (op.isREG()) { + if (op.getReg().getIdx() >= 8) db(0x41); + db(alt | (op.getIdx() & 7)); + return; + } + if (op.isMEM()) { + opModM(op.getAddress(), Reg(ext, Operand::REG, 32), code); + return; + } } + throw Error(ERR_BAD_COMBINATION); } void verifyMemHasSize(const Operand& op) const { @@ -1925,7 +1951,7 @@ private: } else { vex(r, base, p1, type, code, x); } - opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N); + opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N, (type & T_VSIB) != 0); } else { const Reg& base = op2.getReg(); if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) { @@ -2026,8 +2052,7 @@ private: } if (!isOK) throw Error(ERR_BAD_VSIB_ADDRESSING); } - addr.permitVsib(); - opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type | T_YMM, code); + opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type, code); } enum { xx_yy_zz = 0, @@ -2051,7 +2076,6 @@ private: { if (x.hasZero()) throw Error(ERR_INVALID_ZERO); checkGather2(x, addr.getRegExp().getIndex(), mode); - addr.permitVsib(); opVex(x, 0, addr, type, code); } /* @@ -2071,7 +2095,6 @@ private: { if (addr.hasZero()) throw Error(ERR_INVALID_ZERO); if (addr.getRegExp().getIndex().getKind() != kind) throw Error(ERR_BAD_VSIB_ADDRESSING); - addr.permitVsib(); opVex(x, 0, addr, type, code); } public: diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 1bec88ec..9de558df 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.601"; } +const char *getVersionString() const { return "5.67"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -122,8 +122,11 @@ void cmpordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 7); } void cmpordss(const Xmm& x, const Operand& op) { cmpss(x, op, 7); } void cmppd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x66, isXMM_XMMorMEM, imm8); } void cmpps(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x100, isXMM_XMMorMEM, imm8); } +void cmpsb() { db(0xA6); } +void cmpsd() { db(0xA7); } void cmpsd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF2, isXMM_XMMorMEM, imm8); } void cmpss(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF3, isXMM_XMMorMEM, imm8); } +void cmpsw() { db(0x66); db(0xA7); } void cmpunordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 3); } void cmpunordps(const Xmm& x, const Operand& op) { cmpps(x, op, 3); } void cmpunordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 3); } @@ -683,6 +686,9 @@ void sar(const Operand& op, int imm) { opShift(op, imm, 7); } void sarx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_F3 | T_0F38, 0xf7, false); } void sbb(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x18, 3); } void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); } +void scasb() { db(0xAE); } +void scasd() { db(0xAF); } +void scasw() { db(0x66); db(0xAF); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 @@ -742,6 +748,9 @@ void stc() { db(0xF9); } void std() { db(0xFD); } void sti() { db(0xFB); } void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); } +void stosb() { db(0xAA); } +void stosd() { db(0xAB); } +void stosw() { db(0x66); db(0xAB); } void sub(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x28, 5); } void sub(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x28); } void subpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x66, isXMM_XMMorMEM); } @@ -1001,10 +1010,10 @@ void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_X void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xBE); } void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xBF); } void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xBF); } -void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x92, 0); } -void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x92, 1); } -void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x93, 1); } -void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x93, 2); } +void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0); } +void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x92, 1); } +void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x93, 1); } +void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x93, 2); } void vgf2p8affineinvqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCF, imm); } void vgf2p8affineqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCE, imm); } void vgf2p8mulb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_SAE_Z, 0xCF); } @@ -1030,9 +1039,9 @@ void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D); } void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_Z | T_N8, 0x5D); } void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_Z | T_N4, 0x5D); } -void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x29); } +void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x29); } void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x28); } -void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX, 0x29); } +void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x28); } void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); } void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); } @@ -1068,9 +1077,9 @@ void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3 | void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX | T_M_K, 0x11); } void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10); } void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10); } -void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x11); } +void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x11); } void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x10); } -void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX, 0x11); } +void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x10); } void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } @@ -1135,10 +1144,10 @@ void vpextrb(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(8|16| void vpextrd(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(32) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x16, imm); } void vpextrq(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(64) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); opVex(x, 0, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x16, imm); } void vpextrw(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(16|i32e) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); if (op.isREG() && x.getIdx() < 16) { opAVX_X_X_XM(Xmm(op.getIdx()), xm0, x, T_0F | T_66, 0xC5, imm); } else { opVex(x, 0, op, T_0F3A | T_66 | T_EVEX | T_N2, 0x15, imm); } } -void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x90, 1); } -void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x90, 0); } -void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x91, 2); } -void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x91, 1); } +void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x90, 1); } +void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x90, 0); } +void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x91, 2); } +void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x91, 1); } void vphaddd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x02); } void vphaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x03); } void vphaddw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x01); } @@ -1544,7 +1553,10 @@ void jrcxz(std::string label) { opJmp(label, T_SHORT, 0xe3, 0, 0); } void jrcxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); } void cdqe() { db(0x48); db(0x98); } void cqo() { db(0x48); db(0x99); } +void cmpsq() { db(0x48); db(0xA7); } void movsq() { db(0x48); db(0xA5); } +void scasq() { db(0x48); db(0xAF); } +void stosq() { db(0x48); db(0xAB); } void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); } void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); } void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); } @@ -1717,18 +1729,18 @@ void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(1 void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } void vfpclasssd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm); } void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); } -void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x92, 1); } -void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x92, 0); } -void vgatherpf0dpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vgatherpf0dps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vgatherpf0qpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf0qps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf1dpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vgatherpf1dps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x93, 0); } -void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x93, 2); } +void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 1); } +void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 0); } +void vgatherpf0dpd(const Address& addr) { opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vgatherpf0dps(const Address& addr) { opGatherFetch(addr, zm1, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vgatherpf0qpd(const Address& addr) { opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf0qps(const Address& addr) { opGatherFetch(addr, zm1, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf1dpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vgatherpf1dps(const Address& addr) { opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 0); } +void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 2); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x43); } @@ -1745,17 +1757,17 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8 imm) {i void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8 imm) {if (!op.is(Operand::MEM | Operand::YMM)) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8 imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8 imm) {if (!op.is(Operand::MEM | Operand::YMM)) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3A, imm); } -void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqa64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu16(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu16(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu8(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu8(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); } void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); } @@ -1815,10 +1827,10 @@ void vpexpandb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N1 | T void vpexpandd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x89); } void vpexpandq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x89); } void vpexpandw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62); } -void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x90, 0); } -void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x90, 1); } -void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x91, 2); } -void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x91, 0); } +void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 0); } +void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 1); } +void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 2); } +void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } @@ -1869,10 +1881,10 @@ void vprord(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.get void vprorq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 0), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm); } void vprorvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x14); } void vprorvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x14); } -void vpscatterdd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA0, 0); } -void vpscatterdq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA0, 1); } -void vpscatterqd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 2); } -void vpscatterqq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 0); } +void vpscatterdd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 0); } +void vpscatterdq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 1); } +void vpscatterqd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 2); } +void vpscatterqq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 0); } void vpshldd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71, imm); } void vpshldq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71, imm); } void vpshldvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71); } @@ -1936,18 +1948,18 @@ void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x2C); } void vscalefsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_ER_X | T_MUST_EVEX, 0x2D); } void vscalefss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x2D); } -void vscatterdpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA2, 1); } -void vscatterdps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA2, 0); } -void vscatterpf0dpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vscatterpf0dps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vscatterpf0qpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf0qps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf1dpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vscatterpf1dps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vscatterpf1qpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf1qps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterqpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA3, 0); } -void vscatterqps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA3, 2); } +void vscatterdpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 1); } +void vscatterdps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 0); } +void vscatterpf0dpd(const Address& addr) { opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vscatterpf0dps(const Address& addr) { opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vscatterpf0qpd(const Address& addr) { opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf0qps(const Address& addr) { opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf1dpd(const Address& addr) { opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vscatterpf1dps(const Address& addr) { opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vscatterpf1qpd(const Address& addr) { opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf1qps(const Address& addr) { opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterqpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 0); } +void vscatterqps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 2); } void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x23, imm); } void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index e55d66d1..0f6aada0 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -84,6 +84,67 @@ class Cpu { displayModel = model; } } + unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end) + { + return (val >> base) & ((1u << (end - base)) - 1); + } + void setCacheHierarchy() + { + if ((type_ & tINTEL) == 0) return; + const unsigned int NO_CACHE = 0; + const unsigned int DATA_CACHE = 1; +// const unsigned int INSTRUCTION_CACHE = 2; + const unsigned int UNIFIED_CACHE = 3; + unsigned int smt_width = 0; + unsigned int n_cores = 0; + unsigned int data[4]; + + /* + if leaf 11 exists, we use it to get the number of smt cores and cores on socket + If x2APIC is supported, these are the only correct numbers. + + leaf 0xB can be zeroed-out by a hypervisor + */ + getCpuidEx(0x0, 0, data); + if (data[0] >= 0xB) { + getCpuidEx(0xB, 0, data); // CPUID for SMT Level + smt_width = data[1] & 0x7FFF; + getCpuidEx(0xB, 1, data); // CPUID for CORE Level + n_cores = data[1] & 0x7FFF; + } + + /* + Assumptions: + the first level of data cache is not shared (which is the + case for every existing architecture) and use this to + determine the SMT width for arch not supporting leaf 11. + when leaf 4 reports a number of core less than n_cores + on socket reported by leaf 11, then it is a correct number + of cores not an upperbound. + */ + for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { + getCpuidEx(0x4, i, data); + unsigned int cacheType = extractBit(data[0], 0, 4); + if (cacheType == NO_CACHE) break; + if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { + unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; + if (n_cores != 0) { // true only if leaf 0xB is supported and valid + nb_logical_cores = (std::min)(nb_logical_cores, n_cores); + } + assert(nb_logical_cores != 0); + data_cache_size[data_cache_levels] = + (extractBit(data[1], 22, 31) + 1) + * (extractBit(data[1], 12, 21) + 1) + * (extractBit(data[1], 0, 11) + 1) + * (data[2] + 1); + if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; + assert(smt_width != 0); + cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u); + data_cache_levels++; + } + } + } + public: int model; int family; @@ -92,6 +153,25 @@ public: int extFamily; int displayFamily; // family + extFamily int displayModel; // model + extModel + + // may I move these members into private? + static const unsigned int maxNumberCacheLevels = 10; + unsigned int data_cache_size[maxNumberCacheLevels]; + unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; + unsigned int data_cache_levels; + + unsigned int getDataCacheLevels() const { return data_cache_levels; } + unsigned int getCoresSharingDataCache(unsigned int i) const + { + if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); + return cores_sharing_data_cache[i]; + } + unsigned int getDataCacheSize(unsigned int i) const + { + if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); + return data_cache_size[i]; + } + /* data[] = { eax, ebx, ecx, edx } */ @@ -124,6 +204,7 @@ public: #endif } typedef uint64 Type; + static const Type NONE = 0; static const Type tMMX = 1 << 0; static const Type tMMX2 = 1 << 1; @@ -190,6 +271,7 @@ public: Cpu() : type_(NONE) + , data_cache_levels(0) { unsigned int data[4]; const unsigned int& EAX = data[0]; @@ -281,6 +363,7 @@ public: if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); + setCacheHierarchy(); } void putFamily() const {