From f6fdb5f55a88f73ef7bef45f50cf5878ceec9781 Mon Sep 17 00:00:00 2001 From: Merry Date: Fri, 30 Dec 2022 23:05:02 +0000 Subject: [PATCH] Squashed 'externals/xbyak/' changes from 88f2f771f..a1ac3750f a1ac3750f Merge branch 'dev' 5f4ba971f v6.68 cac2c175f update doc 1b08a8248 add test of prefetchiti bef70d9b1 add prefetchit{0,1} f66785876 add detection of prefetchiti 124617ac9 Merge branch 'dev' 1bce8be17 Merge commit 'fbb18f6' into dev bebfe64a3 [test] X32 is optional and remove CFLAGS d8c332cef Merge branch 'Tachi107-small-changes' into dev fbb18f69d ci: use containers 8a974696c test: simplify makefile 1efe9fe7c Merge branch 'dev' bb70083e6 v6.67 436e452d7 update doc 47ff6ef42 I get an error after all on GitHub action 445c0dcec add test of CMPccXADD c9347907d add CMPccXADD dc792cc56 add detection of cmpccxadd 62be84cc8 fix detection of boost b5ac7b0f0 Merge branch 'dev' 05dd400e0 recover a removed line 94eff6246 v6.66 ec3fadeba update doc 00bfaaa7f add test of rao-int 0f2f1aaa6 support rio-int 6c047f480 detect rao-int f07c5c255 fix args of EmptyAllocator::alloc in test 04d3eb5f5 stop if a test script causes an error 1c1d2366f disable boost sample cfb1127c3 tweak 5fcbeb7c4 [sample] change the way of detection of boost bafc1ee60 CXX uses g++ as default value d8cabc6cb remove warning of blace bc73a0816 remove -fno-operator-names option 6989aea94 use CXX instead of g++ a7c5a1bd7 use English 1bfbd8c4d Fix incorrect format strings 0ecef5c28 Merge branch 'dev' 7556c20ba update doc a15709271 v6.65 3b83aab3e add detect_x32 to TARGET 8c64bbbc3 use gcc instead of dpkg for portability 5e9a9b96f test_avx512.sh runs on x32 8ae01b0c2 disable some tests on x32 83b3da217 x32 does not check large disp 693ab8c9d sizeof(void*) = 4 on x32, so disable the test 348e3e548 Merge branch 'dev' 11b9c4dc0 v6.64 459636196 add T_M_K flag to vpmov* 1d3722928 add include path 5e27eddae move CrearError from LabelManager::reset() to CodeGenerator::reset() f8ea5c28d Merge branch 'dev' 20b2b1eae v6.63 3706869f8 desc. of setDefaultEncoding d6f2d7577 add test of setDefaultEncoding 3b0a19c41 vpmadd52{h,l}uq for avx-ifma 95752ebd7 add tAVX_IFMA cd36e31ea [sample] show AMX_FP16/AVX_VNNI_INT8/AVX_NE_CONVERT e5858af27 add setDefaultEncoding 2f7fb0220 modify gen.cpp for AVX-NE-CONVERT/AVX-VNNI-INT8/AMX-FP16 1c5cb7efa add AVX-NE-CONVERT instructions 564fe9acd add AVX-VNNI-INT8 instructions cd14d07b1 add AMX-FP16 instruction 7811f593c Merge pull request #161 from scribam/patch-1 2218f6c08 Update changelog.md 7bccdbbb4 Merge pull request #160 from herumi/dev 5fcf87596 compile nasm-2.15 instead of apt install e31961ea8 v6.62 ff4f9e65c update doc 9c8fb81db disable wrong detection of gcc -Warray-bounds 6b7519659 add serialize e16582696 Merge branch 'JonLiu1993-vcpkg-installition' into dev edbb410fb Add vcpkg installation instructions 055d31242 Merge branch 'dev' 21ab98441 Merge branch 'akodanka-enable_CIV' into dev ed4d598e3 Changes to compile xbyak project for openvino b652430c4 mingw uses __cpuidex 48457bfa0 Merge branch 'dev' 29cb524d1 v6.61.2 1a9a0b0e1 avoid including algorithm header in xbyak_util.h 6fadefd04 Merge branch 'dev' fc1c18a9d update doc f7cae7f11 v6.61.1 6f5ec5cf3 Merge branch 'doyaGu-master' into dev 4554d6bb9 Fix error related to XBYAK_NOEXCEPT git-subtree-dir: externals/xbyak git-subtree-split: a1ac3750f9a639b5a6c6d6c7da4259b8d6790989 --- .github/workflows/main.yml | 18 +++- Android.bp | 8 ++ CMakeLists.txt | 2 +- doc/changelog.md | 9 ++ doc/install.md | 12 +++ doc/usage.md | 8 ++ gen/Makefile | 2 +- gen/gen_avx512.cpp | 40 ++++---- gen/gen_code.cpp | 92 +++++++++++++++++- meson.build | 2 +- readme.md | 3 +- readme.txt | 15 ++- sample/Makefile | 5 +- sample/quantize.cpp | 2 +- sample/test_util.cpp | 7 ++ sample/toyvm.cpp | 8 +- test/Makefile | 49 +++++----- test/Makefile.win | 2 +- test/detect_x32.c | 8 ++ test/make_512.cpp | 54 +++++------ test/make_nm.cpp | 13 ++- test/misc.cpp | 189 ++++++++++++++++++++++++++++++++++++- test/noexception.cpp | 2 +- test/test_address.sh | 10 +- test/test_avx.sh | 9 +- test/test_avx512.sh | 9 +- test/test_nm.sh | 9 +- xbyak/xbyak.h | 37 ++++++-- xbyak/xbyak_mnemonic.h | 88 ++++++++++++----- xbyak/xbyak_util.h | 32 +++++-- 30 files changed, 589 insertions(+), 155 deletions(-) create mode 100644 Android.bp create mode 100644 test/detect_x32.c diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a2a8c7f9..0e291ae2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,13 +1,21 @@ name: test on: [push] +defaults: + run: + shell: sh + +permissions: + contents: read + jobs: - build: - name: test + test: runs-on: ubuntu-latest + container: + image: debian:testing steps: - - uses: actions/checkout@v2 - - run: sudo apt update - - run: sudo apt install nasm yasm g++-multilib tcsh + - uses: actions/checkout@v3 + - run: apt -y update + - run: apt -y install g++-multilib libboost-dev make nasm yasm - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" diff --git a/Android.bp b/Android.bp new file mode 100644 index 00000000..c1e53fb5 --- /dev/null +++ b/Android.bp @@ -0,0 +1,8 @@ +//################################################# +cc_library_headers { + name: "xbyak_headers", + vendor: true, + export_include_dirs: [ + "xbyak" + ], +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 835bec73..a4c2de7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 2.6...3.0.2) -project(xbyak LANGUAGES CXX VERSION 6.61) +project(xbyak LANGUAGES CXX VERSION 6.68) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index 93d23d95..8be31852 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,14 @@ # History +* 2022/Dec/07 ver 6.68 support prefetchit{0,1} +* 2022/Nov/30 ver 6.67 support CMPccXADD +* 2022/Nov/25 ver 6.66 support RAO-INT +* 2022/Nov/22 ver 6.65 consider x32 +* 2022/Nov/04 ver 6.64 some vmov* support addressing with mask +* 2022/Oct/06 ver 6.63 vpmadd52{h,l}uq support AVX-IFMA +* 2022/Oct/05 ver 6.63 support amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding() +* 2022/Aug/15 ver 6.62 add serialize instruction +* 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later * 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode * 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ * 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option diff --git a/doc/install.md b/doc/install.md index ddc1a104..bbec93d2 100644 --- a/doc/install.md +++ b/doc/install.md @@ -12,3 +12,15 @@ make install ``` These files are copied into `/usr/local/include/xbyak`. + +# Building xbyak - Using vcpkg + +You can download and install xbyak using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager: + + git clone https://github.com/Microsoft/vcpkg.git + cd vcpkg + ./bootstrap-vcpkg.sh + ./vcpkg integrate install + ./vcpkg install xbyak + +The xbyak port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. diff --git a/doc/usage.md b/doc/usage.md index 7dad2455..7b5678e7 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2); // VEX encoding ``` + +- setDefaultEncoding(PreferredEncoding encoding); + - Set the default encoding to select EVEX or VEX. + - The default value is EvexEncoding. + - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. + ### Remark * `k1`, ..., `k7` are opmask registers. - `k0` is dealt as no mask. diff --git a/gen/Makefile b/gen/Makefile index 97a68465..f254d71a 100644 --- a/gen/Makefile +++ b/gen/Makefile @@ -1,6 +1,6 @@ TARGET=../xbyak/xbyak_mnemonic.h BIN=sortline gen_code gen_avx512 -CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) +CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt sortline: sortline.cpp $(CXX) $(CFLAGS) $< -o $@ diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 35960bbd..8283a54c 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -387,9 +387,6 @@ void putX_X_XM_IMM() { 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true }, { 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true }, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, - { 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, { 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, { 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, @@ -695,29 +692,29 @@ void putMov() int type; int mode; } tbl[] = { - { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, - { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, - { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, + { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, + { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, + { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, - { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, + { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, - { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, - { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, - { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, + { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, + { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false }, - { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, - { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, - { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, + { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, + { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -827,7 +824,6 @@ void putMisc() puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }"); puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); - puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }"); puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }"); puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }"); diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index a8b169e5..95680536 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -560,6 +560,8 @@ void put() { 0, "nta", 0x18}, { 2, "wt1", 0x0D}, { 1, "w", 0x0D}, + { 7, "it0", 0x18}, + { 6, "it1", 0x18}, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -693,6 +695,7 @@ void put() { "lock", 0xF0 }, { "sahf", 0x9E }, + { "serialize", 0x0F, 0x01, 0xE8 }, { "stc", 0xF9 }, { "std", 0xFD }, { "sti", 0xFB }, @@ -806,6 +809,23 @@ void put() printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext); } } + { + const struct Tbl { + const char *name; + uint8_t prefix; + } tbl[] = { + { "aadd", 0 }, + { "aand", 0x66 }, + { "aor", 0xF2 }, + { "axor", 0xF3 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + printf("void %s(const Address& addr, const Reg32e ®) { ", p->name); + if (p->prefix) printf("db(0x%02X); ", p->prefix); + printf("opModM(addr, reg, 0x0F, 0x38, 0x0FC); }\n"); + } + } { const struct Tbl { @@ -1666,6 +1686,25 @@ void put() puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }"); } + { + const struct Tbl { + const char *name; + int type; + uint8_t code; + } tbl[] = { + { "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 }, + { "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 }, + { "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 } + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); + } + puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }"); + } // haswell gpr(reg, reg, r/m) { const struct Tbl { @@ -1755,11 +1794,33 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; std::string type = type2String(p->type); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); + } + } + // avx-vnni-int8 + { + const struct Tbl { + uint8_t code; + const char *name; + int type; + } tbl[] = { + { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, + { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, + { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string type = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); } } } @@ -1824,6 +1885,34 @@ void put64() puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }"); puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }"); + // CMPccXADD + { + const struct Tbl { + const char *name; + uint8_t code; + } tbl[] = { + { "be", 0xE6 }, + { "b", 0xE2 }, + { "le", 0xEE }, + { "l", 0xEC }, + { "nbe", 0xE7 }, + { "nb", 0xE3 }, + { "nle", 0xEF }, + { "nl", 0xED }, + { "no", 0xE1 }, + { "np", 0xEB }, + { "ns", 0xE9 }, + { "nz", 0xE5 }, + { "o", 0xE0 }, + { "p", 0xEA }, + { "s", 0xE8 }, + { "z", 0xE4 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0x%02X, false); }\n", p->name, p->code); + } + } } void putAMX_TILE() @@ -1842,6 +1931,7 @@ void putAMX_INT8() puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }"); + puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }"); } void putAMX_BF16() { diff --git a/meson.build b/meson.build index 73282e13..9daaa8f9 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '6.61', + version: '6.68', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 69ef3c28..ae7c6341 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 6.61 [![Badge Build]][Build Status] +# Xbyak 6.68 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang. ### News +- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma - add movdiri, movdir64b, clwb, cldemote - WAITPKG instructions (tpause, umonitor, umwait) are supported. - MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp diff --git a/readme.txt b/readme.txt index 14c1ffb3..819fc419 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.61 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68 ----------------------------------------------------------------------------- ◎概要 @@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding - +setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2); // VEX encoding 注意 * k1, ..., k7 は新しいopmaskレジスタです。 * z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。 * `k4 | k3`と`k3 | k4`は意味が異なります。 * {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。 * 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。 +* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。 ・ラベル @@ -400,6 +402,15 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2022/12/07 ver 6.68 prefetchit{0,1}サポート +2022/11/30 ver 6.67 CMPccXADDサポート +2022/11/25 ver 6.66 RAO-INTサポート +2022/11/22 ver 6.65 x32動作確認 +2022/11/04 ver 6.64 vmov*命令をmaskつきアドレッシング対応修正 +2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応 +2022/10/05 amx_fp16/avx_vnni_int8/avx_ne_convertt対応とsetDefaultEncoding()追加 +2022/09/15 ver 6.62 serialize追加 +2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応 2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正 2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正 2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応 diff --git a/sample/Makefile b/sample/Makefile index 7c910bb8..91663607 100644 --- a/sample/Makefile +++ b/sample/Makefile @@ -1,6 +1,7 @@ XBYAK_INC=../xbyak/xbyak.h +CXX?=g++ -BOOST_EXIST=$(shell echo "\#include " | (gcc -E - 2>/dev/null) | grep "boost/spirit/core.hpp" >/dev/null && echo "1") +BOOST_EXIST=$(shell echo "#include " | $(CXX) -x c++ -c - 2>/dev/null && echo 1) UNAME_M=$(shell uname -m) ONLY_64BIT=0 @@ -104,7 +105,7 @@ profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h $(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl clean: - rm -rf *.o $(TARGET) *.exe profiler profiler-vtune + rm -rf $(TARGET) profiler profiler-vtune test : test0.cpp $(XBYAK_INC) test64: test0.cpp $(XBYAK_INC) diff --git a/sample/quantize.cpp b/sample/quantize.cpp index 6bdf0d00..ba0fd22d 100644 --- a/sample/quantize.cpp +++ b/sample/quantize.cpp @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) quantize2(dest2, src, qTbl); for (int i = 0; i < N; i++) { if (dest[i] != dest2[i]) { - printf("err[%d] %d %d\n", i, dest[i], dest2[i]); + printf("err[%d] %u %u\n", i, dest[i], dest2[i]); } } diff --git a/sample/test_util.cpp b/sample/test_util.cpp index 2488ce15..96e9d213 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp @@ -89,6 +89,13 @@ void putCPUinfo(bool onlyCpuidFeature) { Cpu::tMOVDIRI, "movdiri" }, { Cpu::tMOVDIR64B, "movdir64b" }, { Cpu::tCLZERO, "clzero" }, + { Cpu::tAMX_FP16, "amx_fp16" }, + { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" }, + { Cpu::tAVX_NE_CONVERT, "avx_ne_convert" }, + { Cpu::tAVX_IFMA, "avx_ifma" }, + { Cpu::tRAO_INT, "rao-int" }, + { Cpu::tCMPCCXADD, "cmpccxadd" }, + { Cpu::tPREFETCHITI, "prefetchiti" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); diff --git a/sample/toyvm.cpp b/sample/toyvm.cpp index 1e558ff0..dff0cb7d 100644 --- a/sample/toyvm.cpp +++ b/sample/toyvm.cpp @@ -5,8 +5,8 @@ mem_ 4byte x 65536 - ٤Ƥ̿4byte - ¨ͤ16bit + all instructions are fixed at 4 bytes. + all immediate values are 16-bit. R = A or B vldiR, imm ; R = imm @@ -109,7 +109,7 @@ public: reg[r] -= imm; break; case PUT: - printf("%c %8d(0x%08x)\n", 'A' + r, reg[r], reg[r]); + printf("%c %8u(0x%08x)\n", 'A' + r, reg[r], reg[r]); break; case JNZ: if (reg[r] != 0) pc += static_cast(imm); @@ -294,7 +294,7 @@ lp: p = t; n--; if (n != 0) goto lp; - printf("c=%d(0x%08x)\n", c, c); + printf("c=%u(0x%08x)\n", c, c); } int main() diff --git a/test/Makefile b/test/Makefile index 0e7b889d..eecdbe72 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,9 @@ -TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 +TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 XBYAK_INC=../xbyak/xbyak.h UNAME_S=$(shell uname -s) +ifeq ($(shell ./detect_x32),x32) +X32?=1 +endif BIT=32 ifeq ($(shell uname -m),x86_64) BIT=64 @@ -20,9 +23,9 @@ endif all: $(TARGET) -CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith +CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith -CFLAGS=-O2 -fomit-frame-pointer -Wall -fno-operator-names -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x +CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x make_nm: $(CXX) $(CFLAGS) make_nm.cpp -o $@ normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h @@ -53,12 +56,11 @@ noexception: noexception.cpp ../xbyak/xbyak.h test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen ifneq ($(ONLY_64BIT),1) - ./test_nm.sh - ./test_nm.sh noexcept - ./noexception - ./test_nm.sh Y - ./test_nm.sh avx512 - ./test_address.sh + CXX=$(CXX) ./test_nm.sh + CXX=$(CXX) ./test_nm.sh noexcept + CXX=$(CXX) ./test_nm.sh Y + CXX=$(CXX) ./test_nm.sh avx512 + CXX=$(CXX) ./test_address.sh ./jmp ./cvt_test32 endif @@ -67,32 +69,38 @@ endif ./misc32 ./cvt_test ifeq ($(BIT),64) - ./test_address.sh 64 - ./test_nm.sh 64 - ./test_nm.sh Y64 + CXX=$(CXX) ./test_address.sh 64 +ifneq ($(X32),1) + CXX=$(CXX) ./test_nm.sh 64 + CXX=$(CXX) ./test_nm.sh Y64 +endif ./jmp64 endif test_avx: normalize_prefix ifneq ($(ONLY_64BIT),0) - ./test_avx.sh - ./test_avx.sh Y + CXX=$(CXX) ./test_avx.sh + CXX=$(CXX) ./test_avx.sh Y endif ifeq ($(BIT),64) - ./test_address.sh 64 - ./test_avx.sh 64 - ./test_avx.sh Y64 + CXX=$(CXX) ./test_avx.sh 64 +ifneq ($(X32),1) + CXX=$(CXX) ./test_avx.sh Y64 +endif endif test_avx512: normalize_prefix ifneq ($(ONLY_64BIT),0) - ./test_avx512.sh + CXX=$(CXX) ./test_avx512.sh endif ifeq ($(BIT),64) - ./test_avx512.sh 64 + CXX=$(CXX) ./test_avx512.sh 64 endif -test: +detect_x32: detect_x32.c + $(CC) $< -o $@ + +test: detect_x32 $(MAKE) test_nm $(MAKE) test_avx $(MAKE) test_avx512 @@ -104,4 +112,3 @@ lib_run: lib_test.cpp lib_run.cpp lib.h $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run make_nm: make_nm.cpp $(XBYAK_INC) - diff --git a/test/Makefile.win b/test/Makefile.win index 4025ae2c..96105b3f 100644 --- a/test/Makefile.win +++ b/test/Makefile.win @@ -1,4 +1,4 @@ -OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS +OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS -I ../ ../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe ../gen/gen_code.exe > $@ ../gen/gen_avx512.exe >> $@ diff --git a/test/detect_x32.c b/test/detect_x32.c new file mode 100644 index 00000000..549b8d50 --- /dev/null +++ b/test/detect_x32.c @@ -0,0 +1,8 @@ +#include + +int main() +{ +#if defined(__x86_64__) && defined(__ILP32__) + puts("x32"); +#endif +} diff --git a/test/make_512.cpp b/test/make_512.cpp index 83994ab1..39bfa991 100644 --- a/test/make_512.cpp +++ b/test/make_512.cpp @@ -1807,44 +1807,44 @@ public: put("vpmovd2m", K, _XMM | _YMM | _ZMM); put("vpmovq2m", K, _XMM | _YMM | _ZMM); - put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovqd", YMM_KZ | _MEM, _ZMM); + put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovsqd", YMM_KZ | _MEM, _ZMM); + put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovusqd", YMM_KZ | _MEM, _ZMM); + put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); - put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); + put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); + put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); - put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovdw", YMM_KZ | _MEM, _ZMM); + put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovsdw", YMM_KZ | _MEM, _ZMM); + put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovusdw", YMM_KZ | _MEM, _ZMM); + put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovwb", YMM_KZ | _MEM, _ZMM); + put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovswb", YMM_KZ | _MEM, _ZMM); + put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM); - put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM); - put("vpmovuswb", YMM_KZ | _MEM, _ZMM); + put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); + put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM); } void putRot() { diff --git a/test/make_nm.cpp b/test/make_nm.cpp index 801ffe04..e5939eb7 100644 --- a/test/make_nm.cpp +++ b/test/make_nm.cpp @@ -533,6 +533,7 @@ class Test { "nop", "sahf", + "serialize", "stc", "std", "sti", @@ -1017,9 +1018,7 @@ class Test { } void putCmov() const { - const struct { - const char *s; - } tbl[] = { + const char tbl[][4] = { "o", "no", "b", @@ -1053,11 +1052,11 @@ class Test { }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { char buf[32]; - snprintf(buf, sizeof(buf), "cmov%s", tbl[i].s); + snprintf(buf, sizeof(buf), "cmov%s", tbl[i]); put(buf, REG16, REG16|MEM); put(buf, REG32, REG32|MEM); put(buf, REG64, REG64|MEM); - snprintf(buf, sizeof(buf), "set%s", tbl[i].s); + snprintf(buf, sizeof(buf), "set%s", tbl[i]); put(buf, REG8|REG8_3|MEM); } } @@ -1294,7 +1293,7 @@ class Test { put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef"); put("movbe", REG16|REG32e, MEM); put("movbe", MEM, REG16|REG32e); -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]"); put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL); put(p, "qword [rax], 0"); @@ -2608,7 +2607,7 @@ public: putMPX(); #endif -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) #ifdef USE_YASM putRip(); diff --git a/test/misc.cpp b/test/misc.cpp index 236dfb86..2090dca9 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -5,6 +5,7 @@ #include #include #include +#include using namespace Xbyak; @@ -97,13 +98,17 @@ CYBOZU_TEST_AUTO(mov_const) } #ifdef XBYAK64 CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff])); - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); + if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 + CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); + } #ifdef XBYAK_OLD_DISP_CHECK CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000])); CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff])); #else - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); - CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); + if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 + CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); + CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); + } #endif #endif } @@ -875,6 +880,10 @@ CYBOZU_TEST_AUTO(vnni) vpdpbusd(xm0, xm1, xm2); vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX + setDefaultEncoding(VexEncoding); + vpdpbusd(xm0, xm1, xm2); // VEX + setDefaultEncoding(EvexEncoding); + vpdpbusd(xm0, xm1, xm2); // EVEX } void badVex() { @@ -885,6 +894,8 @@ CYBOZU_TEST_AUTO(vnni) 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0xC4, 0xE2, 0x71, 0x50, 0xC2, + 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, }; const size_t n = sizeof(tbl) / sizeof(tbl[0]); CYBOZU_TEST_EQUAL(c.getSize(), n); @@ -1975,3 +1986,175 @@ CYBOZU_TEST_AUTO(cpu) Cpu cpu; CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD)); } + +CYBOZU_TEST_AUTO(minmax) +{ + using namespace Xbyak::util; + CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4)); + CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4)); +} + +CYBOZU_TEST_AUTO(rao_int) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { +#ifdef XBYAK64 + aadd(ptr[rax], ecx); + aadd(ptr[eax], ecx); + aadd(ptr[rax], r10); + aand(ptr[rax], ecx); + aand(ptr[eax], ecx); + aand(ptr[rax], r10); + aor(ptr[rax], ecx); + aor(ptr[eax], ecx); + aor(ptr[rax], r10); + axor(ptr[rax], ecx); + axor(ptr[eax], ecx); + axor(ptr[rax], r10); +#else + aadd(ptr[eax], ecx); + aand(ptr[eax], ecx); + aor(ptr[eax], ecx); + axor(ptr[eax], ecx); +#endif + } + } c; + const uint8_t tbl[] = { +#ifdef XBYAK64 + // aadd + 0x0f, 0x38, 0xfc, 0x08, + 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // aand + 0x66, 0x0f, 0x38, 0xfc, 0x08, + 0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // aor + 0xf2, 0x0f, 0x38, 0xfc, 0x08, + 0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10, + + // axor + 0xf3, 0x0f, 0x38, 0xfc, 0x08, + 0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08, + 0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10, +#else + // aadd + 0x0f, 0x38, 0xfc, 0x08, + // aand + 0x66, 0x0f, 0x38, 0xfc, 0x08, + // aor + 0xf2, 0x0f, 0x38, 0xfc, 0x08, + // axor + 0xf3, 0x0f, 0x38, 0xfc, 0x08, +#endif + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + +#ifdef XBYAK64 +CYBOZU_TEST_AUTO(CMPccXADD) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + // 32bit reg + cmpbexadd(ptr[rax+r10*4], ecx, edx); + cmpbxadd(ptr[rax+r10*4], ecx, edx); + cmplexadd(ptr[rax+r10*4], ecx, edx); + cmplxadd(ptr[rax+r10*4], ecx, edx); + cmpnbexadd(ptr[rax+r10*4], ecx, edx); + cmpnbxadd(ptr[rax+r10*4], ecx, edx); + cmpnlexadd(ptr[rax+r10*4], ecx, edx); + cmpnlxadd(ptr[rax+r10*4], ecx, edx); + cmpnoxadd(ptr[rax+r10*4], ecx, edx); + cmpnpxadd(ptr[rax+r10*4], ecx, edx); + cmpnsxadd(ptr[rax+r10*4], ecx, edx); + cmpnzxadd(ptr[rax+r10*4], ecx, edx); + cmpoxadd(ptr[rax+r10*4], ecx, edx); + cmppxadd(ptr[rax+r10*4], ecx, edx); + cmpsxadd(ptr[rax+r10*4], ecx, edx); + cmpzxadd(ptr[rax+r10*4], ecx, edx); + // 64bit reg + cmpbexadd(ptr[rax+r10*4], rcx, rdx); + cmpbxadd(ptr[rax+r10*4], rcx, rdx); + cmplexadd(ptr[rax+r10*4], rcx, rdx); + cmplxadd(ptr[rax+r10*4], rcx, rdx); + cmpnbexadd(ptr[rax+r10*4], rcx, rdx); + cmpnbxadd(ptr[rax+r10*4], rcx, rdx); + cmpnlexadd(ptr[rax+r10*4], rcx, rdx); + cmpnlxadd(ptr[rax+r10*4], rcx, rdx); + cmpnoxadd(ptr[rax+r10*4], rcx, rdx); + cmpnpxadd(ptr[rax+r10*4], rcx, rdx); + cmpnsxadd(ptr[rax+r10*4], rcx, rdx); + cmpnzxadd(ptr[rax+r10*4], rcx, rdx); + cmpoxadd(ptr[rax+r10*4], rcx, rdx); + cmppxadd(ptr[rax+r10*4], rcx, rdx); + cmpsxadd(ptr[rax+r10*4], rcx, rdx); + cmpzxadd(ptr[rax+r10*4], rcx, rdx); + } + } c; + const uint8_t tbl[] = { + // 32bit reg + 0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90, + 0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90, + // 64bit reg + 0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90, + 0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + +CYBOZU_TEST_AUTO(prefetchiti) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + prefetchit0(ptr[rax]); + prefetchit1(ptr[rax]); + } + } c; + const uint8_t tbl[] = { + 0x0f, 0x18, 0x38, + 0x0f, 0x18, 0x30 + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} +#endif diff --git a/test/noexception.cpp b/test/noexception.cpp index 04a6dbc2..9ef0ee83 100644 --- a/test/noexception.cpp +++ b/test/noexception.cpp @@ -56,7 +56,7 @@ void test2() void test3() { static struct EmptyAllocator : Xbyak::Allocator { - uint8_t *alloc() { return 0; } + uint8_t *alloc(size_t) { return 0; } } emptyAllocator; struct Code : CodeGenerator { Code() : CodeGenerator(8, 0, &emptyAllocator) diff --git a/test/test_address.sh b/test/test_address.sh index d283a5f3..6c9e9b0d 100755 --- a/test/test_address.sh +++ b/test/test_address.sh @@ -1,13 +1,17 @@ #!/bin/sh +set -e + FILTER="grep -v warning" sub() { -CFLAGS="-Wall -fno-operator-names -I../ $OPT2" +CFLAGS="-Wall -I../ $OPT2" +CXX=${CXX:=g++} + echo "compile address.cpp" -g++ $CFLAGS address.cpp -o address +$CXX $CFLAGS address.cpp -o address ./address $1 > a.asm echo "asm" @@ -17,7 +21,7 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst echo "xbyak" ./address $1 jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame > x.lst diff ok.lst x.lst && echo "ok" diff --git a/test/test_avx.sh b/test/test_avx.sh index 34dc1e55..647d4d3a 100755 --- a/test/test_avx.sh +++ b/test/test_avx.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER="grep -v warning" +CXX=${CXX:=g++} case $1 in Y) @@ -31,9 +34,9 @@ Y64) ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX" +CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX" echo "compile make_nm.cpp" -g++ $CFLAGS make_nm.cpp -o make_nm +$CXX $CFLAGS make_nm.cpp -o make_nm ./make_nm > a.asm echo "asm" @@ -43,6 +46,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER echo "xbyak" ./make_nm jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/test/test_avx512.sh b/test/test_avx512.sh index 17edfeec..01079f1e 100755 --- a/test/test_avx512.sh +++ b/test/test_avx512.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER="grep -v warning" +CXX=${CXX:=g++} case $1 in 64) @@ -18,9 +21,9 @@ case $1 in ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512" +CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512" echo "compile make_512.cpp" -g++ $CFLAGS make_512.cpp -o make_512 +$CXX $CFLAGS make_512.cpp -o make_512 ./make_512 > a.asm echo "asm" @@ -30,6 +33,6 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst echo "xbyak" ./make_512 jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/test/test_nm.sh b/test/test_nm.sh index afa2b1eb..cda7d88a 100755 --- a/test/test_nm.sh +++ b/test/test_nm.sh @@ -1,6 +1,9 @@ #!/bin/sh +set -e + FILTER=cat +CXX=${CXX:=g++} case $1 in Y) @@ -44,9 +47,9 @@ noexcept) ;; esac -CFLAGS="-Wall -fno-operator-names -I../ $OPT2" +CFLAGS="-Wall -I../ $OPT2" echo "compile make_nm.cpp with $CFLAGS" -g++ $CFLAGS make_nm.cpp -o make_nm +$CXX $CFLAGS make_nm.cpp -o make_nm ./make_nm > a.asm echo "asm" @@ -56,6 +59,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER echo "xbyak" ./make_nm jit > nm.cpp echo "compile nm_frame.cpp" -g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame +$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame ./nm_frame | $FILTER > x.lst diff -B ok.lst x.lst && echo "ok" diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index eecea612..226c8d18 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -118,7 +118,7 @@ #endif #endif -#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800) +#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900) #undef XBYAK_TLS #define XBYAK_TLS thread_local #define XBYAK_VARIADIC_TEMPLATE @@ -144,11 +144,18 @@ #pragma warning(disable : 4127) /* constant expresison */ #endif +// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603 +#if defined(__GNUC__) && !defined(__clang__) + #define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Warray-bounds" +#endif + namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x6610 /* 0xABCD = A.BC(.D) */ + VERSION = 0x6680 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -371,7 +378,7 @@ inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0 inline uint32_t VerifyInInt32(uint64_t x) { -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0) #endif return static_cast(x); @@ -1478,7 +1485,6 @@ public: clabelDefList_.clear(); clabelUndefList_.clear(); resetLabelPtrList(); - ClearError(); } void enterLocal() { @@ -1820,7 +1826,7 @@ private: void setSIB(const RegExp& e, int reg, int disp8N = 0) { uint64_t disp64 = e.getDisp(); -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) #ifdef XBYAK_OLD_DISP_CHECK // treat 0xffffffff as 0xffffffffffffffff uint64_t high = disp64 >> 32; @@ -2412,18 +2418,21 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) { + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0); + } + int orEvexIf(PreferredEncoding encoding) { if (encoding == DefaultEncoding) { - encoding = EvexEncoding; + encoding = defaultEncoding_; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - type |= T_MUST_EVEX; + return T_MUST_EVEX; } - opAVX_X_X_XM(x1, x2, op, type, code0); + return 0; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2508,6 +2517,7 @@ public: #endif private: bool isDefaultJmpNEAR_; + PreferredEncoding defaultEncoding_; public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -2787,11 +2797,13 @@ public: , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) + , defaultEncoding_(EvexEncoding) { labelMgr_.set(this); } void reset() { + ClearError(); resetSize(); labelMgr_.reset(); labelMgr_.set(this); @@ -2823,6 +2835,9 @@ public: #undef jnl #endif + // set default encoding to select Vex or Evex + void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + /* use single byte nop if useMultiByteNop = false */ @@ -2927,6 +2942,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen #pragma warning(pop) #endif +#if defined(__GNUC__) && !defined(__clang__) + #pragma GCC diagnostic pop +#endif + } // end of namespace #endif // XBYAK_XBYAK_H_ diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 5871557d..7c74e54e 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,6 @@ -const char *getVersionString() const { return "6.61"; } +const char *getVersionString() const { return "6.68"; } +void aadd(const Address& addr, const Reg32e ®) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void aand(const Address& addr, const Reg32e ®) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); } void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); } void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); } +void aor(const Address& addr, const Reg32e ®) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void axor(const Address& addr, const Reg32e ®) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); } void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } @@ -654,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); } void popcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); } void popf() { db(0x9D); } void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); } +void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); } +void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); } void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); } void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); } void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); } @@ -747,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); } void scasb() { db(0xAE); } void scasd() { db(0xAF); } void scasw() { db(0x66); db(0xAF); } +void serialize() { db(0x0F); db(0x01); db(0xE8); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 @@ -844,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } +void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } @@ -988,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); } void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); } void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); } +void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1191,10 +1205,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } -void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } -void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } -void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } -void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } +void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } +void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } @@ -1226,6 +1246,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -1642,6 +1664,22 @@ void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()) void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); } void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); } void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); } +void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); } +void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); } +void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); } +void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); } +void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); } +void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); } +void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); } +void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); } +void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); } +void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); } +void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); } +void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); } +void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); } +void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); } +void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); } +void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); } void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } @@ -1653,6 +1691,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } +void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); } void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } #else void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } @@ -1907,7 +1946,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } -void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -2141,38 +2179,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); } void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); } void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); } void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); } -void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); } -void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); } +void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); } +void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); } void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); } void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); } void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); } void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); } void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); } -void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); } -void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); } -void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); } -void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); } -void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); } -void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); } -void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); } -void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); } -void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); } -void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); } -void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); } -void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); } -void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); } -void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); } -void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); } +void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); } +void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); } +void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); } +void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); } +void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); } +void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); } +void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); } +void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); } +void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); } +void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); } +void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); } +void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); } +void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); } +void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); } +void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); } void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); } -void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); } +void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); } void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); } void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); } void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index db8ac005..da7b68b0 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -4,7 +4,6 @@ #ifdef XBYAK_ONLY_CLASS_CPU #include #include -#include #include #ifndef XBYAK_THROW #define XBYAK_THROW(x) ; @@ -96,6 +95,11 @@ struct TypeT { template TypeT operator|(TypeT, TypeT) { return TypeT(); } +template +inline T max_(T x, T y) { return x >= y ? x : y; } +template +inline T min_(T x, T y) { return x < y ? x : y; } + } // local /** @@ -193,8 +197,8 @@ private: /* Fallback values in case a hypervisor has 0xB leaf zeroed-out. */ - numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]); - numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); + numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]); + numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); } else { /* Failed to deremine num of cores without x2APIC support. @@ -237,7 +241,7 @@ private: if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1; if (logical_cores != 0) { // true only if leaf 0xB is supported and valid - actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); + actual_logical_cores = local::min_(actual_logical_cores, logical_cores); } assert(actual_logical_cores != 0); dataCacheSize_[dataCacheLevels_] = @@ -247,7 +251,7 @@ private: * (data[2] + 1); if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u); dataCacheLevels_++; } } @@ -302,7 +306,7 @@ public: static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) { #ifdef XBYAK_INTEL_CPU_SPECIFIC - #ifdef _MSC_VER + #ifdef _WIN32 __cpuidex(reinterpret_cast(data), eaxIn, ecxIn); #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); @@ -406,6 +410,13 @@ public: XBYAK_DEFINE_TYPE(65, tMOVDIRI); XBYAK_DEFINE_TYPE(66, tMOVDIR64B); XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen + XBYAK_DEFINE_TYPE(68, tAMX_FP16); + XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); + XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); + XBYAK_DEFINE_TYPE(71, tAVX_IFMA); + XBYAK_DEFINE_TYPE(72, tRAO_INT); + XBYAK_DEFINE_TYPE(73, tCMPCCXADD); + XBYAK_DEFINE_TYPE(74, tPREFETCHITI); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -545,10 +556,17 @@ public: if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (maxNumSubLeaves >= 1) { getCpuidEx(7, 1, data); + if (EAX & (1U << 3)) type_ |= tRAO_INT; if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (type_ & tAVX512F) { if (EAX & (1U << 5)) type_ |= tAVX512_BF16; } + if (EAX & (1U << 7)) type_ |= tCMPCCXADD; + if (EAX & (1U << 21)) type_ |= tAMX_FP16; + if (EAX & (1U << 23)) type_ |= tAVX_IFMA; + if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; + if (EDX & (1U << 14)) type_ |= tPREFETCHITI; } } setFamily(); @@ -771,7 +789,7 @@ public: const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM) const Reg64& _rsp = code->rsp; - saveNum_ = (std::max)(0, allRegNum - noSaveNum); + saveNum_ = local::max_(0, allRegNum - noSaveNum); const int *tbl = getOrderTbl() + noSaveNum; for (int i = 0; i < saveNum_; i++) { code->push(Reg64(tbl[i]));