diff --git a/externals/xbyak/gen/gen_avx512.cpp b/externals/xbyak/gen/gen_avx512.cpp index 5e0591ec..526877ee 100644 --- a/externals/xbyak/gen/gen_avx512.cpp +++ b/externals/xbyak/gen/gen_avx512.cpp @@ -202,12 +202,12 @@ void putM_X() const char *name; int type; } tbl[] = { - { 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu8", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu16", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, - { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, + { 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu8", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu16", T_F2 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, + { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/externals/xbyak/gen/gen_code.cpp b/externals/xbyak/gen/gen_code.cpp index fe0b59ac..d34ad754 100644 --- a/externals/xbyak/gen/gen_code.cpp +++ b/externals/xbyak/gen/gen_code.cpp @@ -1233,12 +1233,12 @@ void put() const char *name; int type; } tbl[] = { - { 0x29, "movapd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 }, - { 0x29, "movaps", T_0F | T_YMM | T_EVEX | T_EW0 }, + { 0x29, "movapd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_M_K }, + { 0x29, "movaps", T_0F | T_YMM | T_EVEX | T_EW0 | T_M_K }, { 0x7F, "movdqa", T_0F | T_66 | T_YMM }, { 0x7F, "movdqu", T_0F | T_F3 | T_YMM }, - { 0x11, "movupd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 }, - { 0x11, "movups", T_0F | T_YMM | T_EVEX | T_EW0 }, + { 0x11, "movupd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_M_K }, + { 0x11, "movups", T_0F | T_YMM | T_EVEX | T_EW0 | T_M_K }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/externals/xbyak/readme.md b/externals/xbyak/readme.md index ec1908d5..04bbc941 100644 --- a/externals/xbyak/readme.md +++ b/externals/xbyak/readme.md @@ -1,5 +1,5 @@ -Xbyak 5.601 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ +Xbyak 5.65 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ ============= Abstract @@ -333,6 +333,11 @@ The header files under xbyak/ are independent of cybozulib. History ------------- +* 2018/Jun/26 ver 5.65 fix push(qword [mem]) +* 2018/Mar/07 ver 5.64 fix zero division in Cpu() on some cpu +* 2018/Feb/14 ver 5.63 fix Cpu::setCacheHierarchy() and fix EvexModifierZero for clang<3.9(thanks to mgouicem) +* 2018/Feb/13 ver 5.62 Cpu::setCacheHierarchy() by mgouicem and rsdubtso +* 2018/Feb/07 ver 5.61 vmov* supports mem{k}{z}(I forgot it) * 2018/Jan/24 ver 5.601 add xword, yword, etc. into Xbyak::util namespace * 2018/Jan/05 ver 5.60 support AVX-512 for Ice lake(319433-030.pdf) * 2017/Aug/22 ver 5.53 fix mpx encoding, add bnd() prefix diff --git a/externals/xbyak/readme.txt b/externals/xbyak/readme.txt index aa99b85b..b1c15a20 100644 --- a/externals/xbyak/readme.txt +++ b/externals/xbyak/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.601 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.65 ----------------------------------------------------------------------------- ◎概要 @@ -343,6 +343,11 @@ cybozulibは単体テストでのみ利用されていて、xbyak/ディレク ----------------------------------------------------------------------------- ◎履歴 +2018/06/26 ver 5.65 fix push(qword [mem]) +2018/03/07 ver 5.64 Cpu()の中でzero divisionが出ることがあるのを修正 +2018/02/14 ver 5.63 Cpu::setCacheHierarchy()の修正とclang<3.9のためのEvexModifierZero修正(thanks to mgouicem) +2018/02/13 ver 5.62 Cpu::setCacheHierarchy() by mgouicem and rsdubtso +2018/02/07 ver 5.61 vmov*がmem{k}{z}形式対応(忘れてた) 2018/01/24 ver 5.601 xword, ywordなどをXbyak::util名前空間に追加 2018/01/05 ver 5.60 Ice lake系命令対応(319433-030.pdf) 2017/08/22 ver 5.53 mpxエンコーディングバグ修正, bnd()プレフィクス追加 @@ -470,7 +475,3 @@ cybozulibは単体テストでのみ利用されていて、xbyak/ディレク ◎著作権者 光成滋生(MITSUNARI Shigeo, herumi@nifty.com) - ---- -$Revision: 1.56 $ -$Date: 2010/04/16 11:58:22 $ diff --git a/externals/xbyak/sample/test_util.cpp b/externals/xbyak/sample/test_util.cpp index bb515db9..9b199353 100644 --- a/externals/xbyak/sample/test_util.cpp +++ b/externals/xbyak/sample/test_util.cpp @@ -104,6 +104,9 @@ void putCPUinfo() Core i7-3930K 6 2D */ cpu.putFamily(); + for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { + printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); + } } int main() diff --git a/externals/xbyak/test/Makefile b/externals/xbyak/test/Makefile index e07e1bf1..3180f18d 100644 --- a/externals/xbyak/test/Makefile +++ b/externals/xbyak/test/Makefile @@ -37,6 +37,7 @@ test: normalize_prefix jmp bad_address $(TARGET) $(MAKE) -C ../gen ./test_nm.sh ./test_nm.sh Y + ./test_nm.sh avx512 ./test_address.sh ./jmp ./bad_address diff --git a/externals/xbyak/test/make_512.cpp b/externals/xbyak/test/make_512.cpp index 4efd69f6..408f98b7 100644 --- a/externals/xbyak/test/make_512.cpp +++ b/externals/xbyak/test/make_512.cpp @@ -840,9 +840,9 @@ public: put(p.name, _YMM|YMM_KZ, _YMM|MEM); put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM); if (!p.M_X) continue; - put(p.name, MEM, _XMM); - put(p.name, MEM, _YMM); - put(p.name, MEM, _ZMM); + put(p.name, MEM|MEM_K, _XMM); + put(p.name, MEM|MEM_K, _YMM); + put(p.name, MEM|MEM_K, _ZMM); } put("vsqrtpd", XMM_KZ, M_1to2 | _MEM); put("vsqrtpd", YMM_KZ, M_1to4 | _MEM); diff --git a/externals/xbyak/test/make_nm.cpp b/externals/xbyak/test/make_nm.cpp index 72c82472..cd9db1b4 100644 --- a/externals/xbyak/test/make_nm.cpp +++ b/externals/xbyak/test/make_nm.cpp @@ -1,4 +1,5 @@ #include +#define XBYAK_NO_OP_NAMES #include "xbyak/xbyak.h" #include "xbyak/xbyak_bin2hex.h" #include @@ -121,6 +122,15 @@ class Test { void operator=(const Test&); const bool isXbyak_; int funcNum_; + /* + and_, or_, xor_, not_ => and, or, xor, not + */ + std::string removeUnderScore(std::string s) const + { + if (!isXbyak_ && s[s.size() - 1] == '_') s.resize(s.size() - 1); + return s; + } + // check all op1, op2, op3 void put(const std::string& nm, uint64 op1 = NOPARA, uint64 op2 = NOPARA, uint64 op3 = NOPARA, uint64 op4 = NOPARA) const { @@ -951,15 +961,16 @@ class Test { static const char tbl[][16] = { "adc", "add", - "and", + "and_", "cmp", - "or", + "or_", "sbb", "sub", - "xor", + "xor_", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const char *p = tbl[i]; + const std::string s = removeUnderScore(tbl[i]); + const char *p = s.c_str(); put(p, REG32, REG32|MEM); put(p, REG64, REG64|MEM); put(p, REG16, REG16|MEM); @@ -1017,10 +1028,11 @@ class Test { "imul", "mul", "neg", - "not", + "not_", }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const char *p = tbl[i]; + const std::string s = removeUnderScore(tbl[i]); + const char *p = s.c_str(); put(p, REG32e|REG16|REG8|REG8_3); put(p, MEM32|MEM16|MEM8); } @@ -1042,15 +1054,19 @@ class Test { push word 2 reduce 2-byte stack, so I can't support it */ - const char *p = "push"; - put(p, REG16); - put(p, IMM8); // IMM16 decrease -2 from esp - put(p, MEM16); + put("push", IMM8|IMM32); + if (isXbyak_) { + puts("push(word, 1000);dump();"); + } else { + puts("push word 1000"); + } + + put("push", REG16|MEM16); put("pop", REG16|MEM16); #ifdef XBYAK64 - put("push", REG64); - put("pop", REG64); + put("push", REG64|IMM32|MEM64); + put("pop", REG64|MEM64); #else put("push", REG32|IMM32|MEM32); put("pop", REG32|MEM32); @@ -2672,7 +2688,7 @@ public: }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const char *name = tbl[i]; - put(name, MEM, ZMM); + put(name, MEM|MEM_K, ZMM|XMM|YMM); put(name, ZMM, MEM); } } diff --git a/externals/xbyak/test/nm_frame.cpp b/externals/xbyak/test/nm_frame.cpp index 697c2c4a..9deceba2 100644 --- a/externals/xbyak/test/nm_frame.cpp +++ b/externals/xbyak/test/nm_frame.cpp @@ -1,4 +1,5 @@ #include +#define XBYAK_NO_OP_NAMES #define XBYAK_ENABLE_OMITTED_OPERAND #include "xbyak/xbyak.h" diff --git a/externals/xbyak/test/test_nm.sh b/externals/xbyak/test/test_nm.sh index 412dbf45..6001ace9 100755 --- a/externals/xbyak/test/test_nm.sh +++ b/externals/xbyak/test/test_nm.sh @@ -19,6 +19,12 @@ else if ($1 == "Y64") then set OPT2="-DUSE_YASM -DXBYAK64" set OPT3=win64 set FILTER=./normalize_prefix +else if ($1 == "avx512") then + echo "nasm(64bit) + avx512" + set EXE=nasm + set OPT2="-DXBYAK64 -DUSE_AVX512" + set OPT3=win64 + set FILTER=./normalize_prefix else echo "nasm(32bit)" set EXE=nasm diff --git a/externals/xbyak/xbyak/xbyak.h b/externals/xbyak/xbyak/xbyak.h index 6ab93a09..d7035225 100644 --- a/externals/xbyak/xbyak/xbyak.h +++ b/externals/xbyak/xbyak/xbyak.h @@ -105,7 +105,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5601 /* 0xABCD = A.BC(D) */ + VERSION = 0x5650 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -566,7 +566,7 @@ struct EvexModifierRounding { explicit EvexModifierRounding(int rounding) : rounding(rounding) {} int rounding; }; -struct EvexModifierZero{}; +struct EvexModifierZero{EvexModifierZero() {}}; struct Xmm : public Mmx { explicit Xmm(int idx = 0, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { } @@ -614,16 +614,16 @@ struct Reg64 : public Reg32e { }; struct RegRip { sint64 disp_; - Label* label_; + const Label* label_; bool isAddr_; - explicit RegRip(sint64 disp = 0, Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} + explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} friend const RegRip operator+(const RegRip& r, sint64 disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); } friend const RegRip operator-(const RegRip& r, sint64 disp) { return RegRip(r.disp_ - disp, r.label_, r.isAddr_); } - friend const RegRip operator+(const RegRip& r, Label& label) { + friend const RegRip operator+(const RegRip& r, const Label& label) { if (r.label_ || r.isAddr_) throw Error(ERR_BAD_ADDRESSING); return RegRip(r.disp_, &label); } @@ -1812,15 +1812,20 @@ private: } void opPushPop(const Operand& op, int code, int ext, int alt) { - if (op.isREG()) { - if (op.isBit(16)) db(0x66); - if (op.getReg().getIdx() >= 8) db(0x41); - db(alt | (op.getIdx() & 7)); - } else if (op.isMEM()) { - opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code); - } else { - throw Error(ERR_BAD_COMBINATION); + int bit = op.getBit(); + if (bit == 16 || bit == BIT) { + if (bit == 16) db(0x66); + if (op.isREG()) { + if (op.getReg().getIdx() >= 8) db(0x41); + db(alt | (op.getIdx() & 7)); + return; + } + if (op.isMEM()) { + opModM(op.getAddress(), Reg(ext, Operand::REG, 32), code); + return; + } } + throw Error(ERR_BAD_COMBINATION); } void verifyMemHasSize(const Operand& op) const { diff --git a/externals/xbyak/xbyak/xbyak_mnemonic.h b/externals/xbyak/xbyak/xbyak_mnemonic.h index 1bec88ec..fea242ab 100644 --- a/externals/xbyak/xbyak/xbyak_mnemonic.h +++ b/externals/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.601"; } +const char *getVersionString() const { return "5.65"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -1030,9 +1030,9 @@ void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D); } void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_Z | T_N8, 0x5D); } void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_Z | T_N4, 0x5D); } -void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x29); } +void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x29); } void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x28); } -void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX, 0x29); } +void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x28); } void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); } void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); } @@ -1068,9 +1068,9 @@ void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3 | void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX | T_M_K, 0x11); } void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10); } void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10); } -void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x11); } +void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x11); } void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x10); } -void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX, 0x11); } +void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x10); } void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } @@ -1745,17 +1745,17 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8 imm) {i void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8 imm) {if (!op.is(Operand::MEM | Operand::YMM)) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8 imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8 imm) {if (!op.is(Operand::MEM | Operand::YMM)) throw Error(ERR_BAD_COMBINATION); opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3A, imm); } -void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqa64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu16(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu16(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } -void vmovdqu8(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x7F); } +void vmovdqu8(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F); } void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); } void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); } void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); } diff --git a/externals/xbyak/xbyak/xbyak_util.h b/externals/xbyak/xbyak/xbyak_util.h index e55d66d1..eb27cc9b 100644 --- a/externals/xbyak/xbyak/xbyak_util.h +++ b/externals/xbyak/xbyak/xbyak_util.h @@ -84,6 +84,67 @@ class Cpu { displayModel = model; } } + unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end) + { + return (val >> base) & ((1u << (end - base)) - 1); + } + void setCacheHierarchy() + { + if ((type_ & tINTEL) == 0) return; + const unsigned int NO_CACHE = 0; + const unsigned int DATA_CACHE = 1; +// const unsigned int INSTRUCTION_CACHE = 2; + const unsigned int UNIFIED_CACHE = 3; + unsigned int smt_width = 0; + unsigned int n_cores = 0; + unsigned int data[4]; + + /* + if leaf 11 exists, we use it to get the number of smt cores and cores on socket + If x2APIC is supported, these are the only correct numbers. + + leaf 0xB can be zeroed-out by a hypervisor + */ + getCpuidEx(0x0, 0, data); + if (data[0] >= 0xB) { + getCpuidEx(0xB, 0, data); // CPUID for SMT Level + smt_width = data[1] & 0x7FFF; + getCpuidEx(0xB, 1, data); // CPUID for CORE Level + n_cores = data[1] & 0x7FFF; + } + + /* + Assumptions: + the first level of data cache is not shared (which is the + case for every existing architecture) and use this to + determine the SMT width for arch not supporting leaf 11. + when leaf 4 reports a number of core less than n_cores + on socket reported by leaf 11, then it is a correct number + of cores not an upperbound. + */ + for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { + getCpuidEx(0x4, i, data); + unsigned int cacheType = extractBit(data[0], 0, 4); + if (cacheType == NO_CACHE) break; + if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { + unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; + if (n_cores != 0) { // true only if leaf 0xB is supported and valid + nb_logical_cores = (std::min)(nb_logical_cores, n_cores); + } + assert(nb_logical_cores != 0); + data_cache_size[data_cache_levels] = + (extractBit(data[1], 22, 31) + 1) + * (extractBit(data[1], 12, 21) + 1) + * (extractBit(data[1], 0, 11) + 1) + * (data[2] + 1); + if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; + assert(smt_width != 0); + cores_sharing_data_cache[data_cache_levels] = nb_logical_cores / smt_width; + data_cache_levels++; + } + } + } + public: int model; int family; @@ -92,6 +153,25 @@ public: int extFamily; int displayFamily; // family + extFamily int displayModel; // model + extModel + + // may I move these members into private? + static const unsigned int maxNumberCacheLevels = 10; + unsigned int data_cache_size[maxNumberCacheLevels]; + unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; + unsigned int data_cache_levels; + + unsigned int getDataCacheLevels() const { return data_cache_levels; } + unsigned int getCoresSharingDataCache(unsigned int i) const + { + if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); + return cores_sharing_data_cache[i]; + } + unsigned int getDataCacheSize(unsigned int i) const + { + if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); + return data_cache_size[i]; + } + /* data[] = { eax, ebx, ecx, edx } */ @@ -124,6 +204,7 @@ public: #endif } typedef uint64 Type; + static const Type NONE = 0; static const Type tMMX = 1 << 0; static const Type tMMX2 = 1 << 1; @@ -190,6 +271,7 @@ public: Cpu() : type_(NONE) + , data_cache_levels(0) { unsigned int data[4]; const unsigned int& EAX = data[0]; @@ -281,6 +363,7 @@ public: if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); + setCacheHierarchy(); } void putFamily() const {