diff --git a/externals/xbyak/.github/workflows/main.yml b/externals/xbyak/.github/workflows/main.yml index a2a8c7f9..0e291ae2 100644 --- a/externals/xbyak/.github/workflows/main.yml +++ b/externals/xbyak/.github/workflows/main.yml @@ -1,13 +1,21 @@ name: test on: [push] +defaults: + run: + shell: sh + +permissions: + contents: read + jobs: - build: - name: test + test: runs-on: ubuntu-latest + container: + image: debian:testing steps: - - uses: actions/checkout@v2 - - run: sudo apt update - - run: sudo apt install nasm yasm g++-multilib tcsh + - uses: actions/checkout@v3 + - run: apt -y update + - run: apt -y install g++-multilib libboost-dev make nasm yasm - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" diff --git a/externals/xbyak/Android.bp b/externals/xbyak/Android.bp new file mode 100644 index 00000000..c1e53fb5 --- /dev/null +++ b/externals/xbyak/Android.bp @@ -0,0 +1,8 @@ +//################################################# +cc_library_headers { + name: "xbyak_headers", + vendor: true, + export_include_dirs: [ + "xbyak" + ], +} diff --git a/externals/xbyak/CMakeLists.txt b/externals/xbyak/CMakeLists.txt index 835bec73..a4c2de7d 100644 --- a/externals/xbyak/CMakeLists.txt +++ b/externals/xbyak/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 2.6...3.0.2) -project(xbyak LANGUAGES CXX VERSION 6.61) +project(xbyak LANGUAGES CXX VERSION 6.68) file(GLOB headers xbyak/*.h) diff --git a/externals/xbyak/doc/changelog.md b/externals/xbyak/doc/changelog.md index 93d23d95..8be31852 100644 --- a/externals/xbyak/doc/changelog.md +++ b/externals/xbyak/doc/changelog.md @@ -1,5 +1,14 @@ # History +* 2022/Dec/07 ver 6.68 support prefetchit{0,1} +* 2022/Nov/30 ver 6.67 support CMPccXADD +* 2022/Nov/25 ver 6.66 support RAO-INT +* 2022/Nov/22 ver 6.65 consider x32 +* 2022/Nov/04 ver 6.64 some vmov* support addressing with mask +* 2022/Oct/06 ver 6.63 vpmadd52{h,l}uq support AVX-IFMA +* 2022/Oct/05 ver 6.63 support 
amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding() +* 2022/Aug/15 ver 6.62 add serialize instruction +* 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later * 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode * 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ * 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option diff --git a/externals/xbyak/doc/install.md b/externals/xbyak/doc/install.md index ddc1a104..bbec93d2 100644 --- a/externals/xbyak/doc/install.md +++ b/externals/xbyak/doc/install.md @@ -12,3 +12,15 @@ make install ``` These files are copied into `/usr/local/include/xbyak`. + +# Building xbyak - Using vcpkg + +You can download and install xbyak using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager: + + git clone https://github.com/Microsoft/vcpkg.git + cd vcpkg + ./bootstrap-vcpkg.sh + ./vcpkg integrate install + ./vcpkg install xbyak + +The xbyak port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. diff --git a/externals/xbyak/doc/usage.md b/externals/xbyak/doc/usage.md index 7dad2455..7b5678e7 100644 --- a/externals/xbyak/doc/usage.md +++ b/externals/xbyak/doc/usage.md @@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2); // VEX encoding ``` + +- setDefaultEncoding(PreferredEncoding encoding); + - Set the default encoding to select EVEX or VEX. + - The default value is EvexEncoding. 
+ - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. + ### Remark * `k1`, ..., `k7` are opmask registers. - `k0` is dealt as no mask. diff --git a/externals/xbyak/gen/Makefile b/externals/xbyak/gen/Makefile index 97a68465..f254d71a 100644 --- a/externals/xbyak/gen/Makefile +++ b/externals/xbyak/gen/Makefile @@ -1,6 +1,6 @@ TARGET=../xbyak/xbyak_mnemonic.h BIN=sortline gen_code gen_avx512 -CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) +CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt sortline: sortline.cpp $(CXX) $(CFLAGS) $< -o $@ diff --git a/externals/xbyak/gen/gen_avx512.cpp b/externals/xbyak/gen/gen_avx512.cpp index 35960bbd..8283a54c 100644 --- a/externals/xbyak/gen/gen_avx512.cpp +++ b/externals/xbyak/gen/gen_avx512.cpp @@ -387,9 +387,6 @@ void putX_X_XM_IMM() { 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true }, { 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true }, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, { 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, { 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, @@ -695,29 +692,29 @@ void putMov() int type; int mode; } tbl[] = { - { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, - { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, - { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, + { 0x32, "vpmovqb", T_F3 | T_0F38 | 
T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, + { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, + { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, - { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, + { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, - { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, - { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, + { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, - { 0x33, 
"vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, - { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -827,7 +824,6 @@ void putMisc() puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }"); puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); - puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }"); puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }"); puts("void vp2intersectq(const Opmask& k, 
const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }"); diff --git a/externals/xbyak/gen/gen_code.cpp b/externals/xbyak/gen/gen_code.cpp index a8b169e5..95680536 100644 --- a/externals/xbyak/gen/gen_code.cpp +++ b/externals/xbyak/gen/gen_code.cpp @@ -560,6 +560,8 @@ void put() { 0, "nta", 0x18}, { 2, "wt1", 0x0D}, { 1, "w", 0x0D}, + { 7, "it0", 0x18}, + { 6, "it1", 0x18}, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -693,6 +695,7 @@ void put() { "lock", 0xF0 }, { "sahf", 0x9E }, + { "serialize", 0x0F, 0x01, 0xE8 }, { "stc", 0xF9 }, { "std", 0xFD }, { "sti", 0xFB }, @@ -806,6 +809,23 @@ void put() printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext); } } + { + const struct Tbl { + const char *name; + uint8_t prefix; + } tbl[] = { + { "aadd", 0 }, + { "aand", 0x66 }, + { "aor", 0xF2 }, + { "axor", 0xF3 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + printf("void %s(const Address& addr, const Reg32e &reg) { ", p->name); + if (p->prefix) printf("db(0x%02X); ", p->prefix); + printf("opModM(addr, reg, 0x0F, 0x38, 0x0FC); }\n"); + } + } { const struct Tbl { @@ -1666,6 +1686,25 @@ void put() puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }"); } + { + const struct Tbl { + const char *name; + int type; + uint8_t code; + } tbl[] = { + { "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 
} + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); + } + puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }"); + } // haswell gpr(reg, reg, r/m) { const struct Tbl { @@ -1755,11 +1794,33 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; std::string type = type2String(p->type); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + } + } + // avx-vnni-int8 + { + const struct Tbl { + uint8_t code; + const char *name; + int type; + } tbl[] = { + { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string type = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { 
opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); } } } @@ -1824,6 +1885,34 @@ void put64() puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }"); puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }"); + // CMPccXADD + { + const struct Tbl { + const char *name; + uint8_t code; + } tbl[] = { + { "be", 0xE6 }, + { "b", 0xE2 }, + { "le", 0xEE }, + { "l", 0xEC }, + { "nbe", 0xE7 }, + { "nb", 0xE3 }, + { "nle", 0xEF }, + { "nl", 0xED }, + { "no", 0xE1 }, + { "np", 0xEB }, + { "ns", 0xE9 }, + { "nz", 0xE5 }, + { "o", 0xE0 }, + { "p", 0xEA }, + { "s", 0xE8 }, + { "z", 0xE4 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0x%02X, false); }\n", p->name, p->code); + } + } } void putAMX_TILE() @@ -1842,6 +1931,7 @@ void putAMX_INT8() puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }"); + puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }"); } void putAMX_BF16() { diff --git a/externals/xbyak/meson.build b/externals/xbyak/meson.build index 73282e13..9daaa8f9 100644 --- a/externals/xbyak/meson.build +++ b/externals/xbyak/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '6.61', + version: '6.68', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/externals/xbyak/readme.md b/externals/xbyak/readme.md index 69ef3c28..ae7c6341 100644 --- 
a/externals/xbyak/readme.md +++ b/externals/xbyak/readme.md @@ -1,5 +1,5 @@ -# Xbyak 6.61 [![Badge Build]][Build Status] +# Xbyak 6.68 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang. ### News +- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma - add movdiri, movdir64b, clwb, cldemote - WAITPKG instructions (tpause, umonitor, umwait) are supported. - MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp diff --git a/externals/xbyak/readme.txt b/externals/xbyak/readme.txt index 14c1ffb3..819fc419 100644 --- a/externals/xbyak/readme.txt +++ b/externals/xbyak/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.61 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68 ----------------------------------------------------------------------------- ◎概要 @@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding - +setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2); // VEX encoding 注意 * k1, ..., k7 は新しいopmaskレジスタです。 * z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。 * `k4 | k3`と`k3 | k4`は意味が異なります。 * {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。 * 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。 +* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。 ・ラベル @@ -400,6 +402,15 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2022/12/07 ver 6.68 prefetchit{0,1}サポート +2022/11/30 ver 6.67 CMPccXADDサポート +2022/11/25 ver 6.66 RAO-INTサポート +2022/11/22 ver 6.65 x32動作確認 +2022/11/04 ver 6.64 
vmov*命令をmaskつきアドレッシング対応修正 +2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応 +2022/10/05 amx_fp16/avx_vnni_int8/avx_ne_convert対応とsetDefaultEncoding()追加 +2022/09/15 ver 6.62 serialize追加 +2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応 2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正 2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正 2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応 diff --git a/externals/xbyak/sample/Makefile b/externals/xbyak/sample/Makefile index 7c910bb8..91663607 100644 --- a/externals/xbyak/sample/Makefile +++ b/externals/xbyak/sample/Makefile @@ -1,6 +1,7 @@ XBYAK_INC=../xbyak/xbyak.h +CXX?=g++ -BOOST_EXIST=$(shell echo "\#include <boost/spirit/core.hpp>" | (gcc -E - 2>/dev/null) | grep "boost/spirit/core.hpp" >/dev/null && echo "1") +BOOST_EXIST=$(shell echo "#include <boost/spirit/core.hpp>" | $(CXX) -x c++ -c - 2>/dev/null && echo 1) UNAME_M=$(shell uname -m) ONLY_64BIT=0 @@ -104,7 +105,7 @@ profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h $(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl clean: - rm -rf *.o $(TARGET) *.exe profiler profiler-vtune + rm -rf $(TARGET) profiler profiler-vtune test : test0.cpp $(XBYAK_INC) test64: test0.cpp $(XBYAK_INC) diff --git a/externals/xbyak/sample/quantize.cpp b/externals/xbyak/sample/quantize.cpp index 6bdf0d00..ba0fd22d 100644 --- a/externals/xbyak/sample/quantize.cpp +++ b/externals/xbyak/sample/quantize.cpp @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) quantize2(dest2, src, qTbl); for (int i = 0; i < N; i++) { if (dest[i] != dest2[i]) { - printf("err[%d] %d %d\n", i, dest[i], dest2[i]); + printf("err[%d] %u %u\n", i, dest[i], dest2[i]); } } diff --git a/externals/xbyak/sample/test_util.cpp b/externals/xbyak/sample/test_util.cpp index 2488ce15..96e9d213 100644 --- a/externals/xbyak/sample/test_util.cpp +++ b/externals/xbyak/sample/test_util.cpp @@ -89,6 +89,13 @@ void putCPUinfo(bool onlyCpuidFeature) { Cpu::tMOVDIRI, 
"movdiri" }, { Cpu::tMOVDIR64B, "movdir64b" }, { Cpu::tCLZERO, "clzero" }, + { Cpu::tAMX_FP16, "amx_fp16" }, + { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" }, + { Cpu::tAVX_NE_CONVERT, "avx_ne_convert" }, + { Cpu::tAVX_IFMA, "avx_ifma" }, + { Cpu::tRAO_INT, "rao-int" }, + { Cpu::tCMPCCXADD, "cmpccxadd" }, + { Cpu::tPREFETCHITI, "prefetchiti" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); diff --git a/externals/xbyak/sample/toyvm.cpp b/externals/xbyak/sample/toyvm.cpp index 1e558ff0..dff0cb7d 100644 --- a/externals/xbyak/sample/toyvm.cpp +++ b/externals/xbyak/sample/toyvm.cpp @@ -5,8 +5,8 @@ mem_ 4byte x 65536 - ٤Ƥ̿4byte - ¨ͤ16bit + all instructions are fixed at 4 bytes. + all immediate values are 16-bit. R = A or B vldiR, imm ; R = imm @@ -109,7 +109,7 @@ public: reg[r] -= imm; break; case PUT: - printf("%c %8d(0x%08x)\n", 'A' + r, reg[r], reg[r]); + printf("%c %8u(0x%08x)\n", 'A' + r, reg[r], reg[r]); break; case JNZ: if (reg[r] != 0) pc += static_cast(imm); @@ -294,7 +294,7 @@ lp: p = t; n--; if (n != 0) goto lp; - printf("c=%d(0x%08x)\n", c, c); + printf("c=%u(0x%08x)\n", c, c); } int main() diff --git a/externals/xbyak/test/Makefile b/externals/xbyak/test/Makefile index 0e7b889d..eecdbe72 100644 --- a/externals/xbyak/test/Makefile +++ b/externals/xbyak/test/Makefile @@ -1,6 +1,9 @@ -TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 +TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 XBYAK_INC=../xbyak/xbyak.h UNAME_S=$(shell uname -s) +ifeq ($(shell ./detect_x32),x32) +X32?=1 +endif BIT=32 ifeq ($(shell uname -m),x86_64) BIT=64 @@ -20,9 +23,9 @@ endif all: $(TARGET) -CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith +CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith -CFLAGS=-O2 -fomit-frame-pointer -Wall 
-fno-operator-names -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x +CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x make_nm: $(CXX) $(CFLAGS) make_nm.cpp -o $@ normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h @@ -53,12 +56,11 @@ noexception: noexception.cpp ../xbyak/xbyak.h test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen ifneq ($(ONLY_64BIT),1) - ./test_nm.sh - ./test_nm.sh noexcept - ./noexception - ./test_nm.sh Y - ./test_nm.sh avx512 - ./test_address.sh + CXX=$(CXX) ./test_nm.sh + CXX=$(CXX) ./test_nm.sh noexcept + CXX=$(CXX) ./test_nm.sh Y + CXX=$(CXX) ./test_nm.sh avx512 + CXX=$(CXX) ./test_address.sh ./jmp ./cvt_test32 endif @@ -67,32 +69,38 @@ endif ./misc32 ./cvt_test ifeq ($(BIT),64) - ./test_address.sh 64 - ./test_nm.sh 64 - ./test_nm.sh Y64 + CXX=$(CXX) ./test_address.sh 64 +ifneq ($(X32),1) + CXX=$(CXX) ./test_nm.sh 64 + CXX=$(CXX) ./test_nm.sh Y64 +endif ./jmp64 endif test_avx: normalize_prefix ifneq ($(ONLY_64BIT),0) - ./test_avx.sh - ./test_avx.sh Y + CXX=$(CXX) ./test_avx.sh + CXX=$(CXX) ./test_avx.sh Y endif ifeq ($(BIT),64) - ./test_address.sh 64 - ./test_avx.sh 64 - ./test_avx.sh Y64 + CXX=$(CXX) ./test_avx.sh 64 +ifneq ($(X32),1) + CXX=$(CXX) ./test_avx.sh Y64 +endif endif test_avx512: normalize_prefix ifneq ($(ONLY_64BIT),0) - ./test_avx512.sh + CXX=$(CXX) ./test_avx512.sh endif ifeq ($(BIT),64) - ./test_avx512.sh 64 + CXX=$(CXX) ./test_avx512.sh 64 endif -test: +detect_x32: detect_x32.c + $(CC) $< -o $@ + +test: detect_x32 $(MAKE) test_nm $(MAKE) test_avx $(MAKE) test_avx512 @@ -104,4 +112,3 @@ lib_run: lib_test.cpp lib_run.cpp lib.h $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run make_nm: make_nm.cpp $(XBYAK_INC) - diff --git a/externals/xbyak/test/Makefile.win b/externals/xbyak/test/Makefile.win index 4025ae2c..96105b3f 100644 --- a/externals/xbyak/test/Makefile.win +++ b/externals/xbyak/test/Makefile.win @@ -1,4 +1,4 @@ -OPT=/EHsc -I../xbyak /W4 
-D_CRT_SECURE_NO_WARNINGS +OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS -I ../ ../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe ../gen/gen_code.exe > $@ ../gen/gen_avx512.exe >> $@ diff --git a/externals/xbyak/test/detect_x32.c b/externals/xbyak/test/detect_x32.c new file mode 100644 index 00000000..549b8d50 --- /dev/null +++ b/externals/xbyak/test/detect_x32.c @@ -0,0 +1,8 @@ +#include <stdio.h> + +int main() +{ +#if defined(__x86_64__) && defined(__ILP32__) + puts("x32"); +#endif +} diff --git a/externals/xbyak/test/make_512.cpp b/externals/xbyak/test/make_512.cpp index 83994ab1..39bfa991 100644 --- a/externals/xbyak/test/make_512.cpp +++ b/externals/xbyak/test/make_512.cpp @@ -1807,44 +1807,44 @@ public: put("vpmovd2m", K, _XMM | _YMM | _ZMM); put("vpmovq2m", K, _XMM | _YMM | _ZMM); - put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovqd", YMM_KZ | _MEM, _ZMM); + put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovsqd", YMM_KZ | _MEM, _ZMM); + put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovusqd", YMM_KZ | _MEM, _ZMM); + put("vpmovusqd", XMM_KZ 
| _MEM | MEM_K, _XMM | _YMM); + put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovdw", YMM_KZ | _MEM, _ZMM); + put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovsdw", YMM_KZ | _MEM, _ZMM); + put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovusdw", YMM_KZ | _MEM, _ZMM); + put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovwb", YMM_KZ | _MEM, _ZMM); + put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovswb", YMM_KZ | _MEM, _ZMM); + put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovuswb", YMM_KZ | _MEM, _ZMM); + put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM); } void putRot() { diff --git a/externals/xbyak/test/make_nm.cpp b/externals/xbyak/test/make_nm.cpp index 801ffe04..e5939eb7 100644 --- a/externals/xbyak/test/make_nm.cpp +++ b/externals/xbyak/test/make_nm.cpp @@ -533,6 +533,7 @@ class Test { "nop", "sahf", + "serialize", "stc", "std", "sti", @@ -1017,9 +1018,7 @@ class Test { } void putCmov() const { - const struct { - const char *s; - } tbl[] = { + const char tbl[][4] = { "o", 
"no", "b", @@ -1053,11 +1052,11 @@ class Test { }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { char buf[32]; - snprintf(buf, sizeof(buf), "cmov%s", tbl[i].s); + snprintf(buf, sizeof(buf), "cmov%s", tbl[i]); put(buf, REG16, REG16|MEM); put(buf, REG32, REG32|MEM); put(buf, REG64, REG64|MEM); - snprintf(buf, sizeof(buf), "set%s", tbl[i].s); + snprintf(buf, sizeof(buf), "set%s", tbl[i]); put(buf, REG8|REG8_3|MEM); } } @@ -1294,7 +1293,7 @@ class Test { put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef"); put("movbe", REG16|REG32e, MEM); put("movbe", MEM, REG16|REG32e); -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]"); put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL); put(p, "qword [rax], 0"); @@ -2608,7 +2607,7 @@ public: putMPX(); #endif -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) #ifdef USE_YASM putRip(); diff --git a/externals/xbyak/test/misc.cpp b/externals/xbyak/test/misc.cpp index 236dfb86..2090dca9 100644 --- a/externals/xbyak/test/misc.cpp +++ b/externals/xbyak/test/misc.cpp @@ -5,6 +5,7 @@ #include #include #include +#include <algorithm> using namespace Xbyak; @@ -97,13 +98,17 @@ CYBOZU_TEST_AUTO(mov_const) } #ifdef XBYAK64 CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff])); - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); + if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 + CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); + } #ifdef XBYAK_OLD_DISP_CHECK CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000])); CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff])); #else - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); + if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 + CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); 
+ CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); + } #endif #endif } @@ -875,6 +880,10 @@ CYBOZU_TEST_AUTO(vnni) vpdpbusd(xm0, xm1, xm2); vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX + setDefaultEncoding(VexEncoding); + vpdpbusd(xm0, xm1, xm2); // VEX + setDefaultEncoding(EvexEncoding); + vpdpbusd(xm0, xm1, xm2); // EVEX } void badVex() { @@ -885,6 +894,8 @@ CYBOZU_TEST_AUTO(vnni) 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, }; const size_t n = sizeof(tbl) / sizeof(tbl[0]); CYBOZU_TEST_EQUAL(c.getSize(), n); @@ -1975,3 +1986,175 @@ CYBOZU_TEST_AUTO(cpu) Cpu cpu; CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD)); } + +CYBOZU_TEST_AUTO(minmax) +{ + using namespace Xbyak::util; + CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4)); + CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4)); +} + +CYBOZU_TEST_AUTO(rao_int) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { +#ifdef XBYAK64 + aadd(ptr[rax], ecx); + aadd(ptr[eax], ecx); + aadd(ptr[rax], r10); + aand(ptr[rax], ecx); + aand(ptr[eax], ecx); + aand(ptr[rax], r10); + aor(ptr[rax], ecx); + aor(ptr[eax], ecx); + aor(ptr[rax], r10); + axor(ptr[rax], ecx); + axor(ptr[eax], ecx); + axor(ptr[rax], r10); +#else + aadd(ptr[eax], ecx); + aand(ptr[eax], ecx); + aor(ptr[eax], ecx); + axor(ptr[eax], ecx); +#endif + } + } c; + const uint8_t tbl[] = { +#ifdef XBYAK64 + // aadd + 0x0f, 0x38, 0xfc, 0x08, + 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // aand + 0x66, 0x0f, 0x38, 0xfc, 0x08, + 0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // aor + 0xf2, 0x0f, 0x38, 0xfc, 0x08, + 0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // axor + 0xf3, 0x0f, 0x38, 0xfc, 0x08, + 0xf3, 0x67, 0x0f, 0x38, 0xfc, 
0x08, + 0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10, +#else + // aadd + 0x0f, 0x38, 0xfc, 0x08, + // aand + 0x66, 0x0f, 0x38, 0xfc, 0x08, + // aor + 0xf2, 0x0f, 0x38, 0xfc, 0x08, + // axor + 0xf3, 0x0f, 0x38, 0xfc, 0x08, +#endif + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + +#ifdef XBYAK64 +CYBOZU_TEST_AUTO(CMPccXADD) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + // 32bit reg + cmpbexadd(ptr[rax+r10*4], ecx, edx); + cmpbxadd(ptr[rax+r10*4], ecx, edx); + cmplexadd(ptr[rax+r10*4], ecx, edx); + cmplxadd(ptr[rax+r10*4], ecx, edx); + cmpnbexadd(ptr[rax+r10*4], ecx, edx); + cmpnbxadd(ptr[rax+r10*4], ecx, edx); + cmpnlexadd(ptr[rax+r10*4], ecx, edx); + cmpnlxadd(ptr[rax+r10*4], ecx, edx); + cmpnoxadd(ptr[rax+r10*4], ecx, edx); + cmpnpxadd(ptr[rax+r10*4], ecx, edx); + cmpnsxadd(ptr[rax+r10*4], ecx, edx); + cmpnzxadd(ptr[rax+r10*4], ecx, edx); + cmpoxadd(ptr[rax+r10*4], ecx, edx); + cmppxadd(ptr[rax+r10*4], ecx, edx); + cmpsxadd(ptr[rax+r10*4], ecx, edx); + cmpzxadd(ptr[rax+r10*4], ecx, edx); + // 64bit reg + cmpbexadd(ptr[rax+r10*4], rcx, rdx); + cmpbxadd(ptr[rax+r10*4], rcx, rdx); + cmplexadd(ptr[rax+r10*4], rcx, rdx); + cmplxadd(ptr[rax+r10*4], rcx, rdx); + cmpnbexadd(ptr[rax+r10*4], rcx, rdx); + cmpnbxadd(ptr[rax+r10*4], rcx, rdx); + cmpnlexadd(ptr[rax+r10*4], rcx, rdx); + cmpnlxadd(ptr[rax+r10*4], rcx, rdx); + cmpnoxadd(ptr[rax+r10*4], rcx, rdx); + cmpnpxadd(ptr[rax+r10*4], rcx, rdx); + cmpnsxadd(ptr[rax+r10*4], rcx, rdx); + cmpnzxadd(ptr[rax+r10*4], rcx, rdx); + cmpoxadd(ptr[rax+r10*4], rcx, rdx); + cmppxadd(ptr[rax+r10*4], rcx, rdx); + cmpsxadd(ptr[rax+r10*4], rcx, rdx); + cmpzxadd(ptr[rax+r10*4], rcx, rdx); + } + } c; + const uint8_t tbl[] = { + // 32bit reg + 0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 
0xe3, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90, + // 64bit reg + 0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + +CYBOZU_TEST_AUTO(prefetchiti) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + prefetchit0(ptr[rax]); + prefetchit1(ptr[rax]); + } + } c; + const uint8_t tbl[] = { + 0x0f, 0x18, 0x38, + 0x0f, 0x18, 0x30 + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +#endif diff --git a/externals/xbyak/test/noexception.cpp b/externals/xbyak/test/noexception.cpp index 04a6dbc2..9ef0ee83 100644 --- a/externals/xbyak/test/noexception.cpp +++ b/externals/xbyak/test/noexception.cpp @@ -56,7 +56,7 @@ void test2() void test3() { static struct EmptyAllocator : Xbyak::Allocator { - uint8_t *alloc() { return 0; } + uint8_t *alloc(size_t) { return 0; } } emptyAllocator; struct Code : CodeGenerator { Code() : CodeGenerator(8, 0, &emptyAllocator) 
diff --git a/externals/xbyak/test/test_address.sh b/externals/xbyak/test/test_address.sh index d283a5f3..6c9e9b0d 100755 --- a/externals/xbyak/test/test_address.sh +++ b/externals/xbyak/test/test_address.sh @@ -1,13 +1,17 @@ #!/bin/sh +set -e + FILTER="grep -v warning" sub() { -CFLAGS="-Wall -fno-operator-names -I../ $OPT2" +CFLAGS="-Wall -I../ $OPT2" +CXX=${CXX:=g++} + echo "compile address.cpp" -g++ $CFLAGS address.cpp -o address +$CXX $CFLAGS address.cpp -o address ./address $1 > a.asm echo "asm" @@ -17,7 +21,7 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst echo "xbyak" ./address $1 jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame > x.lst diff ok.lst x.lst && echo "ok" diff --git a/externals/xbyak/test/test_avx.sh b/externals/xbyak/test/test_avx.sh index 34dc1e55..647d4d3a 100755 --- a/externals/xbyak/test/test_avx.sh +++ b/externals/xbyak/test/test_avx.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER="grep -v warning" +CXX=${CXX:=g++} case $1 in Y) @@ -31,9 +34,9 @@ Y64) ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX" +CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX" echo "compile make_nm.cpp" -g++ $CFLAGS make_nm.cpp -o make_nm +$CXX $CFLAGS make_nm.cpp -o make_nm ./make_nm > a.asm echo "asm" @@ -43,6 +46,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? 
$3 : $3 ORS}' a.lst | $FILTER echo "xbyak" ./make_nm jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/externals/xbyak/test/test_avx512.sh b/externals/xbyak/test/test_avx512.sh index 17edfeec..01079f1e 100755 --- a/externals/xbyak/test/test_avx512.sh +++ b/externals/xbyak/test/test_avx512.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER="grep -v warning" +CXX=${CXX:=g++} case $1 in 64) @@ -18,9 +21,9 @@ case $1 in ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512" +CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512" echo "compile make_512.cpp" -g++ $CFLAGS make_512.cpp -o make_512 +$CXX $CFLAGS make_512.cpp -o make_512 ./make_512 > a.asm echo "asm" @@ -30,6 +33,6 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst echo "xbyak" ./make_512 jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/externals/xbyak/test/test_nm.sh b/externals/xbyak/test/test_nm.sh index afa2b1eb..cda7d88a 100755 --- a/externals/xbyak/test/test_nm.sh +++ b/externals/xbyak/test/test_nm.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER=cat +CXX=${CXX:=g++} case $1 in Y) @@ -44,9 +47,9 @@ noexcept) ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2" +CFLAGS="-Wall -I../ $OPT2" echo "compile make_nm.cpp with $CFLAGS" -g++ $CFLAGS make_nm.cpp -o make_nm +$CXX $CFLAGS make_nm.cpp -o make_nm ./make_nm > a.asm echo "asm" @@ -56,6 +59,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? 
$3 : $3 ORS}' a.lst | $FILTER echo "xbyak" ./make_nm jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/externals/xbyak/xbyak/xbyak.h b/externals/xbyak/xbyak/xbyak.h index eecea612..226c8d18 100644 --- a/externals/xbyak/xbyak/xbyak.h +++ b/externals/xbyak/xbyak/xbyak.h @@ -118,7 +118,7 @@ #endif #endif -#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800) +#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900) #undef XBYAK_TLS #define XBYAK_TLS thread_local #define XBYAK_VARIADIC_TEMPLATE @@ -144,11 +144,18 @@ #pragma warning(disable : 4127) /* constant expresison */ #endif +// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603 +#if defined(__GNUC__) && !defined(__clang__) + #define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Warray-bounds" +#endif + namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x6610 /* 0xABCD = A.BC(.D) */ + VERSION = 0x6680 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -371,7 +378,7 @@ inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0 inline uint32_t VerifyInInt32(uint64_t x) { -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0) #endif return static_cast(x); @@ -1478,7 +1485,6 @@ public: clabelDefList_.clear(); clabelUndefList_.clear(); resetLabelPtrList(); - ClearError(); } void enterLocal() { @@ -1820,7 +1826,7 @@ private: void setSIB(const RegExp& e, int reg, int disp8N = 0) { uint64_t disp64 = e.getDisp(); -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) #ifdef XBYAK_OLD_DISP_CHECK // treat 0xffffffff as 0xffffffffffffffff uint64_t high = disp64 >> 32; @@ -2412,18 +2418,21 @@ 
private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) { + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0); + } + int orEvexIf(PreferredEncoding encoding) { if (encoding == DefaultEncoding) { - encoding = EvexEncoding; + encoding = defaultEncoding_; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - type |= T_MUST_EVEX; + return T_MUST_EVEX; } - opAVX_X_X_XM(x1, x2, op, type, code0); + return 0; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2508,6 +2517,7 @@ public: #endif private: bool isDefaultJmpNEAR_; + PreferredEncoding defaultEncoding_; public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -2787,11 +2797,13 @@ public: , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) + , defaultEncoding_(EvexEncoding) { labelMgr_.set(this); } void reset() { + ClearError(); resetSize(); labelMgr_.reset(); labelMgr_.set(this); @@ -2823,6 +2835,9 @@ public: #undef jnl #endif + // set default encoding to select Vex or Evex + void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + /* use single byte nop if useMultiByteNop = false */ @@ -2927,6 +2942,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen #pragma warning(pop) #endif +#if defined(__GNUC__) && !defined(__clang__) + #pragma GCC diagnostic pop +#endif + } // end of namespace #endif // XBYAK_XBYAK_H_ diff --git a/externals/xbyak/xbyak/xbyak_mnemonic.h b/externals/xbyak/xbyak/xbyak_mnemonic.h index 5871557d..7c74e54e 
100644 --- a/externals/xbyak/xbyak/xbyak_mnemonic.h +++ b/externals/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,6 @@ -const char *getVersionString() const { return "6.61"; } +const char *getVersionString() const { return "6.68"; } +void aadd(const Address& addr, const Reg32e ®) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void aand(const Address& addr, const Reg32e ®) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); } void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); } void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); } +void aor(const Address& addr, const Reg32e ®) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void axor(const Address& addr, const Reg32e ®) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); } void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } @@ -654,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); } void popcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); } void popf() { db(0x9D); } void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); } +void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); } +void 
prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); } void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); } void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); } void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); } @@ -747,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); } void scasb() { db(0xAE); } void scasd() { db(0xAF); } void scasw() { db(0x66); db(0xAF); } +void serialize() { db(0x0F); db(0x01); db(0xE8); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 @@ -844,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } +void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& 
x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } @@ -988,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); } void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); } void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); } +void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1191,10 +1205,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void 
vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } -void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } -void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } -void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } -void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } +void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbuuds(const Xmm& x1, const 
Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } +void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } @@ -1226,6 +1246,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const 
Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -1642,6 +1664,22 @@ void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()) void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); } void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); } void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); } +void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); } +void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); } +void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); } +void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); } +void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); } +void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, 
false); } +void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); } +void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); } +void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); } +void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); } +void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); } +void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); } +void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); } +void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); } +void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); } +void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); } void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } @@ -1653,6 +1691,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } +void tdpfp16ps(const Tmm 
&x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); } void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } #else void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } @@ -1907,7 +1946,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } -void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -2141,38 +2179,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { 
opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); } void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); } void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); } void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); } -void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); } -void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); } +void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); } +void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); } void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); } void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); } void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); 
} void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); } void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); } -void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); } -void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); } -void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); } -void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); } -void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); } -void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); } -void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); } -void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); } -void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); } -void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); } -void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); } -void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); } -void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL 
| T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); } -void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); } -void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); } +void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); } +void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); } +void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); } +void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); } +void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); } +void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); } +void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); } +void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); } +void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); } +void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); } +void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); } +void vpmovusqb(const 
Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); } +void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); } +void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); } +void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); } void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); } -void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); } +void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); } void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); } void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); } void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } diff --git a/externals/xbyak/xbyak/xbyak_util.h b/externals/xbyak/xbyak/xbyak_util.h index db8ac005..da7b68b0 100644 --- a/externals/xbyak/xbyak/xbyak_util.h +++ b/externals/xbyak/xbyak/xbyak_util.h @@ -4,7 +4,6 @@ #ifdef XBYAK_ONLY_CLASS_CPU #include #include -#include #include #ifndef XBYAK_THROW #define XBYAK_THROW(x) ; @@ -96,6 +95,11 @@ struct TypeT { template TypeT operator|(TypeT, TypeT) { return TypeT(); } +template +inline T max_(T x, T y) { return x >= y ? x : y; } +template +inline T min_(T x, T y) { return x < y ? 
x : y; } + } // local /** @@ -193,8 +197,8 @@ private: /* Fallback values in case a hypervisor has 0xB leaf zeroed-out. */ - numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]); - numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); + numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]); + numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); } else { /* Failed to deremine num of cores without x2APIC support. @@ -237,7 +241,7 @@ private: if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1; if (logical_cores != 0) { // true only if leaf 0xB is supported and valid - actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); + actual_logical_cores = local::min_(actual_logical_cores, logical_cores); } assert(actual_logical_cores != 0); dataCacheSize_[dataCacheLevels_] = @@ -247,7 +251,7 @@ private: * (data[2] + 1); if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u); dataCacheLevels_++; } } @@ -302,7 +306,7 @@ public: static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) { #ifdef XBYAK_INTEL_CPU_SPECIFIC - #ifdef _MSC_VER + #ifdef _WIN32 __cpuidex(reinterpret_cast(data), eaxIn, ecxIn); #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); @@ -406,6 +410,13 @@ public: XBYAK_DEFINE_TYPE(65, tMOVDIRI); XBYAK_DEFINE_TYPE(66, tMOVDIR64B); XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen + XBYAK_DEFINE_TYPE(68, tAMX_FP16); + XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); + XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); + XBYAK_DEFINE_TYPE(71, tAVX_IFMA); + XBYAK_DEFINE_TYPE(72, tRAO_INT); + XBYAK_DEFINE_TYPE(73, tCMPCCXADD); + 
XBYAK_DEFINE_TYPE(74, tPREFETCHITI); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -545,10 +556,17 @@ public: if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (maxNumSubLeaves >= 1) { getCpuidEx(7, 1, data); + if (EAX & (1U << 3)) type_ |= tRAO_INT; if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (type_ & tAVX512F) { if (EAX & (1U << 5)) type_ |= tAVX512_BF16; } + if (EAX & (1U << 7)) type_ |= tCMPCCXADD; + if (EAX & (1U << 21)) type_ |= tAMX_FP16; + if (EAX & (1U << 23)) type_ |= tAVX_IFMA; + if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; + if (EDX & (1U << 14)) type_ |= tPREFETCHITI; } } setFamily(); @@ -771,7 +789,7 @@ public: const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM) const Reg64& _rsp = code->rsp; - saveNum_ = (std::max)(0, allRegNum - noSaveNum); + saveNum_ = local::max_(0, allRegNum - noSaveNum); const int *tbl = getOrderTbl() + noSaveNum; for (int i = 0; i < saveNum_; i++) { code->push(Reg64(tbl[i]));