Squashed 'externals/xbyak/' changes from 88f2f771f..a1ac3750f

a1ac3750f Merge branch 'dev'
5f4ba971f v6.68
cac2c175f update doc
1b08a8248 add test of prefetchiti
bef70d9b1 add prefetchit{0,1}
f66785876 add detection of prefetchiti
124617ac9 Merge branch 'dev'
1bce8be17 Merge commit 'fbb18f6' into dev
bebfe64a3 [test] X32 is optional and remove CFLAGS
d8c332cef Merge branch 'Tachi107-small-changes' into dev
fbb18f69d ci: use containers
8a974696c test: simplify makefile
1efe9fe7c Merge branch 'dev'
bb70083e6 v6.67
436e452d7 update doc
47ff6ef42 I get an error after all on GitHub action
445c0dcec add test of CMPccXADD
c9347907d add CMPccXADD
dc792cc56 add detection of cmpccxadd
62be84cc8 fix detection of boost
b5ac7b0f0 Merge branch 'dev'
05dd400e0 recover a removed line
94eff6246 v6.66
ec3fadeba update doc
00bfaaa7f add test of rao-int
0f2f1aaa6 support rio-int
6c047f480 detect rao-int
f07c5c255 fix args of EmptyAllocator::alloc in test
04d3eb5f5 stop if a test script causes an error
1c1d2366f disable boost sample
cfb1127c3 tweak
5fcbeb7c4 [sample] change the way of detection of boost
bafc1ee60 CXX uses g++ as default value
d8cabc6cb remove warning of blace
bc73a0816 remove -fno-operator-names option
6989aea94 use CXX instead of g++
a7c5a1bd7 use English
1bfbd8c4d Fix incorrect format strings
0ecef5c28 Merge branch 'dev'
7556c20ba update doc
a15709271 v6.65
3b83aab3e add detect_x32 to TARGET
8c64bbbc3 use gcc instead of dpkg for portability
5e9a9b96f test_avx512.sh runs on x32
8ae01b0c2 disable some tests on x32
83b3da217 x32 does not check large disp
693ab8c9d sizeof(void*) = 4 on x32, so disable the test
348e3e548 Merge branch 'dev'
11b9c4dc0 v6.64
459636196 add T_M_K flag to vpmov*
1d3722928 add include path
5e27eddae move CrearError from LabelManager::reset() to CodeGenerator::reset()
f8ea5c28d Merge branch 'dev'
20b2b1eae v6.63
3706869f8 desc. of setDefaultEncoding
d6f2d7577 add test of setDefaultEncoding
3b0a19c41 vpmadd52{h,l}uq for avx-ifma
95752ebd7 add tAVX_IFMA
cd36e31ea [sample] show AMX_FP16/AVX_VNNI_INT8/AVX_NE_CONVERT
e5858af27 add setDefaultEncoding
2f7fb0220 modify gen.cpp for AVX-NE-CONVERT/AVX-VNNI-INT8/AMX-FP16
1c5cb7efa add AVX-NE-CONVERT instructions
564fe9acd add AVX-VNNI-INT8 instructions
cd14d07b1 add AMX-FP16 instruction
7811f593c Merge pull request #161 from scribam/patch-1
2218f6c08 Update changelog.md
7bccdbbb4 Merge pull request #160 from herumi/dev
5fcf87596 compile nasm-2.15 instead of apt install
e31961ea8 v6.62
ff4f9e65c update doc
9c8fb81db disable wrong detection of gcc -Warray-bounds
6b7519659 add serialize
e16582696 Merge branch 'JonLiu1993-vcpkg-installition' into dev
edbb410fb Add vcpkg installation instructions
055d31242 Merge branch 'dev'
21ab98441 Merge branch 'akodanka-enable_CIV' into dev
ed4d598e3 Changes to compile xbyak project for openvino
b652430c4 mingw uses __cpuidex
48457bfa0 Merge branch 'dev'
29cb524d1 v6.61.2
1a9a0b0e1 avoid including algorithm header in xbyak_util.h
6fadefd04 Merge branch 'dev'
fc1c18a9d update doc
f7cae7f11 v6.61.1
6f5ec5cf3 Merge branch 'doyaGu-master' into dev
4554d6bb9 Fix error related to XBYAK_NOEXCEPT

git-subtree-dir: externals/xbyak
git-subtree-split: a1ac3750f9a639b5a6c6d6c7da4259b8d6790989
This commit is contained in:
Merry 2022-12-30 23:05:02 +00:00
parent 5b6e3d8b54
commit f6fdb5f55a
30 changed files with 589 additions and 155 deletions

View file

@ -1,13 +1,21 @@
name: test name: test
on: [push] on: [push]
defaults:
run:
shell: sh
permissions:
contents: read
jobs: jobs:
build: test:
name: test
runs-on: ubuntu-latest runs-on: ubuntu-latest
container:
image: debian:testing
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
- run: sudo apt update - run: apt -y update
- run: sudo apt install nasm yasm g++-multilib tcsh - run: apt -y install g++-multilib libboost-dev make nasm yasm
- run: make test - run: make test
- run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION"

8
Android.bp Normal file
View file

@ -0,0 +1,8 @@
//#################################################
// Header-only module named "xbyak_headers"; it exports the "xbyak"
// directory as an include directory and declares no source files.
// NOTE(review): `vendor: true` presumably marks this as a vendor module -
// confirm against the Soong module reference.
cc_library_headers {
name: "xbyak_headers",
vendor: true,
export_include_dirs: [
"xbyak"
],
}

View file

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 2.6...3.0.2) cmake_minimum_required(VERSION 2.6...3.0.2)
project(xbyak LANGUAGES CXX VERSION 6.61) project(xbyak LANGUAGES CXX VERSION 6.68)
file(GLOB headers xbyak/*.h) file(GLOB headers xbyak/*.h)

View file

@ -1,5 +1,14 @@
# History # History
* 2022/Dec/07 ver 6.68 support prefetchit{0,1}
* 2022/Nov/30 ver 6.67 support CMPccXADD
* 2022/Nov/25 ver 6.66 support RAO-INT
* 2022/Nov/22 ver 6.65 consider x32
* 2022/Nov/04 ver 6.64 some vpmov* instructions support addressing with mask
* 2022/Oct/06 ver 6.63 vpmadd52{h,l}uq support AVX-IFMA
* 2022/Oct/05 ver 6.63 support amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding()
* 2022/Aug/15 ver 6.62 add serialize instruction
* 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later
* 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode * 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode
* 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ * 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ
* 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option * 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option

View file

@ -12,3 +12,15 @@ make install
``` ```
These files are copied into `/usr/local/include/xbyak`. These files are copied into `/usr/local/include/xbyak`.
# Building xbyak - Using vcpkg
You can download and install xbyak using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh
./vcpkg integrate install
./vcpkg install xbyak
The xbyak port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.

View file

@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
setDefaultEncoding(VexEncoding); // default encoding is VEX
vpdpbusd(xm0, xm1, xm2); // VEX encoding
``` ```
- setDefaultEncoding(PreferredEncoding encoding);
- Set the default encoding to select EVEX or VEX.
- The default value is EvexEncoding.
- This function affects only instructions that take a PreferredEncoding argument, such as vpdpbusd.
### Remark ### Remark
* `k1`, ..., `k7` are opmask registers. * `k1`, ..., `k7` are opmask registers.
- `k0` is dealt as no mask. - `k0` is dealt as no mask.

View file

@ -1,6 +1,6 @@
TARGET=../xbyak/xbyak_mnemonic.h TARGET=../xbyak/xbyak_mnemonic.h
BIN=sortline gen_code gen_avx512 BIN=sortline gen_code gen_avx512
CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt
sortline: sortline.cpp sortline: sortline.cpp
$(CXX) $(CFLAGS) $< -o $@ $(CXX) $(CFLAGS) $< -o $@

View file

@ -387,9 +387,6 @@ void putX_X_XM_IMM()
{ 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true }, { 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true },
{ 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true }, { 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true },
{ 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
{ 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
{ 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true }, { 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true },
{ 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true }, { 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true },
{ 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true }, { 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true },
@ -695,29 +692,29 @@ void putMov()
int type; int type;
int mode; int mode;
} tbl[] = { } tbl[] = {
{ 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, { 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false }, { 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false }, { 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true }, { 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
@ -827,7 +824,6 @@ void putMisc()
puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }"); puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }");
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }"); puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }"); puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }");
puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }"); puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }");

View file

@ -560,6 +560,8 @@ void put()
{ 0, "nta", 0x18}, { 0, "nta", 0x18},
{ 2, "wt1", 0x0D}, { 2, "wt1", 0x0D},
{ 1, "w", 0x0D}, { 1, "w", 0x0D},
{ 7, "it0", 0x18},
{ 6, "it1", 0x18},
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
@ -693,6 +695,7 @@ void put()
{ "lock", 0xF0 }, { "lock", 0xF0 },
{ "sahf", 0x9E }, { "sahf", 0x9E },
{ "serialize", 0x0F, 0x01, 0xE8 },
{ "stc", 0xF9 }, { "stc", 0xF9 },
{ "std", 0xFD }, { "std", 0xFD },
{ "sti", 0xFB }, { "sti", 0xFB },
@ -806,6 +809,23 @@ void put()
printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext); printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext);
} }
} }
{
const struct Tbl {
const char *name;
uint8_t prefix;
} tbl[] = {
{ "aadd", 0 },
{ "aand", 0x66 },
{ "aor", 0xF2 },
{ "axor", 0xF3 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
printf("void %s(const Address& addr, const Reg32e &reg) { ", p->name);
if (p->prefix) printf("db(0x%02X); ", p->prefix);
printf("opModM(addr, reg, 0x0F, 0x38, 0x0FC); }\n");
}
}
{ {
const struct Tbl { const struct Tbl {
@ -1666,6 +1686,25 @@ void put()
puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }"); puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }");
} }
{
const struct Tbl {
const char *name;
int type;
uint8_t code;
} tbl[] = {
{ "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
{ "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
{ "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 }
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code);
}
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }");
}
// haswell gpr(reg, reg, r/m) // haswell gpr(reg, reg, r/m)
{ {
const struct Tbl { const struct Tbl {
@ -1755,11 +1794,33 @@ void put()
{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
{ 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string type = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code); printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code);
}
}
// avx-vnni-int8
{
const struct Tbl {
uint8_t code;
const char *name;
int type;
} tbl[] = {
{ 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
} }
} }
} }
@ -1824,6 +1885,34 @@ void put64()
puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }"); puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }");
puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }"); puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }");
// CMPccXADD
{
const struct Tbl {
const char *name;
uint8_t code;
} tbl[] = {
{ "be", 0xE6 },
{ "b", 0xE2 },
{ "le", 0xEE },
{ "l", 0xEC },
{ "nbe", 0xE7 },
{ "nb", 0xE3 },
{ "nle", 0xEF },
{ "nl", 0xED },
{ "no", 0xE1 },
{ "np", 0xEB },
{ "ns", 0xE9 },
{ "nz", 0xE5 },
{ "o", 0xE0 },
{ "p", 0xEA },
{ "s", 0xE8 },
{ "z", 0xE4 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0x%02X, false); }\n", p->name, p->code);
}
}
} }
void putAMX_TILE() void putAMX_TILE()
@ -1842,6 +1931,7 @@ void putAMX_INT8()
puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }"); puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }"); puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }");
puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }");
} }
void putAMX_BF16() void putAMX_BF16()
{ {

View file

@ -5,7 +5,7 @@
project( project(
'xbyak', 'xbyak',
'cpp', 'cpp',
version: '6.61', version: '6.68',
license: 'BSD-3-Clause', license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release' default_options: 'b_ndebug=if-release'
) )

View file

@ -1,5 +1,5 @@
# Xbyak 6.61 [![Badge Build]][Build Status] # Xbyak 6.68 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### News ### News
- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
- add movdiri, movdir64b, clwb, cldemote - add movdiri, movdir64b, clwb, cldemote
- WAITPKG instructions (tpause, umonitor, umwait) are supported. - WAITPKG instructions (tpause, umonitor, umwait) are supported.
- MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp - MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.61 C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎概要 ◎概要
@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
setDefaultEncoding(VexEncoding); // default encoding is VEX
vpdpbusd(xm0, xm1, xm2); // VEX encoding
注意 注意
* k1, ..., k7 は新しいopmaskレジスタです。 * k1, ..., k7 は新しいopmaskレジスタです。
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。 * z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
* `k4 | k3`と`k3 | k4`は意味が異なります。 * `k4 | k3`と`k3 | k4`は意味が異なります。
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。 * {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。 * 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。
・ラベル ・ラベル
@ -400,6 +402,15 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎履歴 ◎履歴
2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート
2022/11/22 ver 6.65 x32動作確認
2022/11/04 ver 6.64 vpmov*命令をmaskつきアドレッシング対応修正
2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応
2022/10/05 ver 6.63 amx_fp16/avx_vnni_int8/avx_ne_convert対応とsetDefaultEncoding()追加
2022/08/15 ver 6.62 serialize追加
2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応
2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正 2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正
2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正 2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正
2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応 2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応

View file

@ -1,6 +1,7 @@
XBYAK_INC=../xbyak/xbyak.h XBYAK_INC=../xbyak/xbyak.h
CXX?=g++
BOOST_EXIST=$(shell echo "\#include <boost/spirit/core.hpp>" | (gcc -E - 2>/dev/null) | grep "boost/spirit/core.hpp" >/dev/null && echo "1") BOOST_EXIST=$(shell echo "#include <boost/spirit/core.hpp>" | $(CXX) -x c++ -c - 2>/dev/null && echo 1)
UNAME_M=$(shell uname -m) UNAME_M=$(shell uname -m)
ONLY_64BIT=0 ONLY_64BIT=0
@ -104,7 +105,7 @@ profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl $(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
clean: clean:
rm -rf *.o $(TARGET) *.exe profiler profiler-vtune rm -rf $(TARGET) profiler profiler-vtune
test : test0.cpp $(XBYAK_INC) test : test0.cpp $(XBYAK_INC)
test64: test0.cpp $(XBYAK_INC) test64: test0.cpp $(XBYAK_INC)

View file

@ -199,7 +199,7 @@ int main(int argc, char *argv[])
quantize2(dest2, src, qTbl); quantize2(dest2, src, qTbl);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
if (dest[i] != dest2[i]) { if (dest[i] != dest2[i]) {
printf("err[%d] %d %d\n", i, dest[i], dest2[i]); printf("err[%d] %u %u\n", i, dest[i], dest2[i]);
} }
} }

View file

@ -89,6 +89,13 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tMOVDIRI, "movdiri" }, { Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" }, { Cpu::tMOVDIR64B, "movdir64b" },
{ Cpu::tCLZERO, "clzero" }, { Cpu::tCLZERO, "clzero" },
{ Cpu::tAMX_FP16, "amx_fp16" },
{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
{ Cpu::tAVX_NE_CONVERT, "avx_ne_convert" },
{ Cpu::tAVX_IFMA, "avx_ifma" },
{ Cpu::tRAO_INT, "rao-int" },
{ Cpu::tCMPCCXADD, "cmpccxadd" },
{ Cpu::tPREFETCHITI, "prefetchiti" },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);

View file

@ -5,8 +5,8 @@
mem_ 4byte x 65536 mem_ 4byte x 65536
4byte固定 all instructions are fixed at 4 bytes.
16bit all immediate values are 16-bit.
R = A or B R = A or B
vldiR, imm ; R = imm vldiR, imm ; R = imm
@ -109,7 +109,7 @@ public:
reg[r] -= imm; reg[r] -= imm;
break; break;
case PUT: case PUT:
printf("%c %8d(0x%08x)\n", 'A' + r, reg[r], reg[r]); printf("%c %8u(0x%08x)\n", 'A' + r, reg[r], reg[r]);
break; break;
case JNZ: case JNZ:
if (reg[r] != 0) pc += static_cast<signed short>(imm); if (reg[r] != 0) pc += static_cast<signed short>(imm);
@ -294,7 +294,7 @@ lp:
p = t; p = t;
n--; n--;
if (n != 0) goto lp; if (n != 0) goto lp;
printf("c=%d(0x%08x)\n", c, c); printf("c=%u(0x%08x)\n", c, c);
} }
int main() int main()

View file

@ -1,6 +1,9 @@
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
XBYAK_INC=../xbyak/xbyak.h XBYAK_INC=../xbyak/xbyak.h
UNAME_S=$(shell uname -s) UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32)
X32?=1
endif
BIT=32 BIT=32
ifeq ($(shell uname -m),x86_64) ifeq ($(shell uname -m),x86_64)
BIT=64 BIT=64
@ -20,9 +23,9 @@ endif
all: $(TARGET) all: $(TARGET)
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS=-O2 -fomit-frame-pointer -Wall -fno-operator-names -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
make_nm: make_nm:
$(CXX) $(CFLAGS) make_nm.cpp -o $@ $(CXX) $(CFLAGS) make_nm.cpp -o $@
normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h
@ -53,12 +56,11 @@ noexception: noexception.cpp ../xbyak/xbyak.h
test_nm: normalize_prefix $(TARGET) test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen $(MAKE) -C ../gen
ifneq ($(ONLY_64BIT),1) ifneq ($(ONLY_64BIT),1)
./test_nm.sh CXX=$(CXX) ./test_nm.sh
./test_nm.sh noexcept CXX=$(CXX) ./test_nm.sh noexcept
./noexception CXX=$(CXX) ./test_nm.sh Y
./test_nm.sh Y CXX=$(CXX) ./test_nm.sh avx512
./test_nm.sh avx512 CXX=$(CXX) ./test_address.sh
./test_address.sh
./jmp ./jmp
./cvt_test32 ./cvt_test32
endif endif
@ -67,32 +69,38 @@ endif
./misc32 ./misc32
./cvt_test ./cvt_test
ifeq ($(BIT),64) ifeq ($(BIT),64)
./test_address.sh 64 CXX=$(CXX) ./test_address.sh 64
./test_nm.sh 64 ifneq ($(X32),1)
./test_nm.sh Y64 CXX=$(CXX) ./test_nm.sh 64
CXX=$(CXX) ./test_nm.sh Y64
endif
./jmp64 ./jmp64
endif endif
test_avx: normalize_prefix test_avx: normalize_prefix
ifneq ($(ONLY_64BIT),0) ifneq ($(ONLY_64BIT),0)
./test_avx.sh CXX=$(CXX) ./test_avx.sh
./test_avx.sh Y CXX=$(CXX) ./test_avx.sh Y
endif endif
ifeq ($(BIT),64) ifeq ($(BIT),64)
./test_address.sh 64 CXX=$(CXX) ./test_avx.sh 64
./test_avx.sh 64 ifneq ($(X32),1)
./test_avx.sh Y64 CXX=$(CXX) ./test_avx.sh Y64
endif
endif endif
test_avx512: normalize_prefix test_avx512: normalize_prefix
ifneq ($(ONLY_64BIT),0) ifneq ($(ONLY_64BIT),0)
./test_avx512.sh CXX=$(CXX) ./test_avx512.sh
endif endif
ifeq ($(BIT),64) ifeq ($(BIT),64)
./test_avx512.sh 64 CXX=$(CXX) ./test_avx512.sh 64
endif endif
test: detect_x32: detect_x32.c
$(CC) $< -o $@
test: detect_x32
$(MAKE) test_nm $(MAKE) test_nm
$(MAKE) test_avx $(MAKE) test_avx
$(MAKE) test_avx512 $(MAKE) test_avx512
@ -104,4 +112,3 @@ lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
make_nm: make_nm.cpp $(XBYAK_INC) make_nm: make_nm.cpp $(XBYAK_INC)

View file

@ -1,4 +1,4 @@
OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS -I ../
../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe ../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe
../gen/gen_code.exe > $@ ../gen/gen_code.exe > $@
../gen/gen_avx512.exe >> $@ ../gen/gen_avx512.exe >> $@

8
test/detect_x32.c Normal file
View file

@ -0,0 +1,8 @@
#include <stdio.h>
/*
	Report whether the compiler targets the x32 ABI.
	Writes the line "x32" to stdout when both __x86_64__ and __ILP32__ are
	predefined by the compiler; writes nothing otherwise.
*/
int main(void)
{
#ifdef __x86_64__
#ifdef __ILP32__
	puts("x32");
#endif
#endif
	return 0;
}

View file

@ -1807,44 +1807,44 @@ public:
put("vpmovd2m", K, _XMM | _YMM | _ZMM); put("vpmovd2m", K, _XMM | _YMM | _ZMM);
put("vpmovq2m", K, _XMM | _YMM | _ZMM); put("vpmovq2m", K, _XMM | _YMM | _ZMM);
put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovqd", YMM_KZ | _MEM, _ZMM); put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovsqd", YMM_KZ | _MEM, _ZMM); put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovusqd", YMM_KZ | _MEM, _ZMM); put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM); put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovdw", YMM_KZ | _MEM, _ZMM); put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovsdw", YMM_KZ | _MEM, _ZMM); put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovusdw", YMM_KZ | _MEM, _ZMM); put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovwb", YMM_KZ | _MEM, _ZMM); put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovswb", YMM_KZ | _MEM, _ZMM); put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM); put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovuswb", YMM_KZ | _MEM, _ZMM); put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM);
} }
void putRot() void putRot()
{ {

View file

@ -533,6 +533,7 @@ class Test {
"nop", "nop",
"sahf", "sahf",
"serialize",
"stc", "stc",
"std", "std",
"sti", "sti",
@ -1017,9 +1018,7 @@ class Test {
} }
void putCmov() const void putCmov() const
{ {
const struct { const char tbl[][4] = {
const char *s;
} tbl[] = {
"o", "o",
"no", "no",
"b", "b",
@ -1053,11 +1052,11 @@ class Test {
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
char buf[32]; char buf[32];
snprintf(buf, sizeof(buf), "cmov%s", tbl[i].s); snprintf(buf, sizeof(buf), "cmov%s", tbl[i]);
put(buf, REG16, REG16|MEM); put(buf, REG16, REG16|MEM);
put(buf, REG32, REG32|MEM); put(buf, REG32, REG32|MEM);
put(buf, REG64, REG64|MEM); put(buf, REG64, REG64|MEM);
snprintf(buf, sizeof(buf), "set%s", tbl[i].s); snprintf(buf, sizeof(buf), "set%s", tbl[i]);
put(buf, REG8|REG8_3|MEM); put(buf, REG8|REG8_3|MEM);
} }
} }
@ -1294,7 +1293,7 @@ class Test {
put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef"); put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef");
put("movbe", REG16|REG32e, MEM); put("movbe", REG16|REG32e, MEM);
put("movbe", MEM, REG16|REG32e); put("movbe", MEM, REG16|REG32e);
#ifdef XBYAK64 #if defined(XBYAK64) && !defined(__ILP32__)
put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]"); put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]");
put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL); put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL);
put(p, "qword [rax], 0"); put(p, "qword [rax], 0");
@ -2608,7 +2607,7 @@ public:
putMPX(); putMPX();
#endif #endif
#ifdef XBYAK64 #if defined(XBYAK64) && !defined(__ILP32__)
#ifdef USE_YASM #ifdef USE_YASM
putRip(); putRip();

View file

@ -5,6 +5,7 @@
#include <xbyak/xbyak_util.h> #include <xbyak/xbyak_util.h>
#include <cybozu/inttype.hpp> #include <cybozu/inttype.hpp>
#include <cybozu/test.hpp> #include <cybozu/test.hpp>
#include <algorithm>
using namespace Xbyak; using namespace Xbyak;
@ -97,13 +98,17 @@ CYBOZU_TEST_AUTO(mov_const)
} }
#ifdef XBYAK64 #ifdef XBYAK64
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff])); CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff]));
if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
}
#ifdef XBYAK_OLD_DISP_CHECK #ifdef XBYAK_OLD_DISP_CHECK
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000])); CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff])); CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
#else #else
if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
}
#endif #endif
#endif #endif
} }
@ -875,6 +880,10 @@ CYBOZU_TEST_AUTO(vnni)
vpdpbusd(xm0, xm1, xm2); vpdpbusd(xm0, xm1, xm2);
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX
setDefaultEncoding(VexEncoding);
vpdpbusd(xm0, xm1, xm2); // VEX
setDefaultEncoding(EvexEncoding);
vpdpbusd(xm0, xm1, xm2); // EVEX
} }
void badVex() void badVex()
{ {
@ -885,6 +894,8 @@ CYBOZU_TEST_AUTO(vnni)
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2, 0xC4, 0xE2, 0x71, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
}; };
const size_t n = sizeof(tbl) / sizeof(tbl[0]); const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL(c.getSize(), n);
@ -1975,3 +1986,175 @@ CYBOZU_TEST_AUTO(cpu)
Cpu cpu; Cpu cpu;
CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD)); CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD));
} }
// Checks that the helpers local::min_ and local::max_ return the same values
// as std::min and std::max for the argument pair (3, 4).
CYBOZU_TEST_AUTO(minmax)
{
// NOTE(review): presumably `local` is a namespace nested in Xbyak::util, which
// is why this using-directive is needed - confirm against xbyak_util.h.
using namespace Xbyak::util;
// std::min / std::max are written in parentheses so that a function-like
// min/max macro, if one is defined, is not expanded here.
CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4));
CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4));
}
// Encoding test for the RAO-INT mnemonics aadd / aand / aor / axor.
// The Code constructor assembles each mnemonic with a memory destination and a
// register source; the generated bytes are then compared with tbl.
// The rows of tbl are listed in the same order as the calls in the
// constructor, so the two lists must be kept in sync when either is edited.
CYBOZU_TEST_AUTO(rao_int)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
#ifdef XBYAK64
// XBYAK64 build: three forms per mnemonic - [rax] with ecx, [eax] with ecx, [rax] with r10
aadd(ptr[rax], ecx);
aadd(ptr[eax], ecx);
aadd(ptr[rax], r10);
aand(ptr[rax], ecx);
aand(ptr[eax], ecx);
aand(ptr[rax], r10);
aor(ptr[rax], ecx);
aor(ptr[eax], ecx);
aor(ptr[rax], r10);
axor(ptr[rax], ecx);
axor(ptr[eax], ecx);
axor(ptr[rax], r10);
#else
// non-XBYAK64 build: only the [eax] with ecx form of each mnemonic
aadd(ptr[eax], ecx);
aand(ptr[eax], ecx);
aor(ptr[eax], ecx);
axor(ptr[eax], ecx);
#endif
}
} c;
// Expected bytes, one row per instruction above.
// Every row contains 0x0f 0x38 0xfc followed by one final byte; the aand / aor /
// axor rows additionally start with 0x66 / 0xf2 / 0xf3 respectively.
// In the XBYAK64 rows the ptr[eax] form carries an extra 0x67 byte and the r10
// form an extra 0x4c byte.
const uint8_t tbl[] = {
#ifdef XBYAK64
// aadd
0x0f, 0x38, 0xfc, 0x08,
0x67, 0x0f, 0x38, 0xfc, 0x08,
0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
#else
// aadd
0x0f, 0x38, 0xfc, 0x08,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
#endif
};
// number of expected bytes
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// the generated code must have the expected length ...
CYBOZU_TEST_EQUAL(c.getSize(), n);
// ... and the expected content
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#ifdef XBYAK64
// Encoding test for the CMPccXADD family (compiled only when XBYAK64 is
// defined, see the enclosing #ifdef).
// Each of the 16 condition variants is assembled on the memory operand
// [rax+r10*4], first with the 32-bit registers ecx/edx and then with the
// 64-bit registers rcx/rdx. The generated bytes are compared with tbl.
// The rows of tbl follow the order of the calls in the constructor, so the two
// lists must be kept in sync when either is edited.
CYBOZU_TEST_AUTO(CMPccXADD)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// 32bit reg
cmpbexadd(ptr[rax+r10*4], ecx, edx);
cmpbxadd(ptr[rax+r10*4], ecx, edx);
cmplexadd(ptr[rax+r10*4], ecx, edx);
cmplxadd(ptr[rax+r10*4], ecx, edx);
cmpnbexadd(ptr[rax+r10*4], ecx, edx);
cmpnbxadd(ptr[rax+r10*4], ecx, edx);
cmpnlexadd(ptr[rax+r10*4], ecx, edx);
cmpnlxadd(ptr[rax+r10*4], ecx, edx);
cmpnoxadd(ptr[rax+r10*4], ecx, edx);
cmpnpxadd(ptr[rax+r10*4], ecx, edx);
cmpnsxadd(ptr[rax+r10*4], ecx, edx);
cmpnzxadd(ptr[rax+r10*4], ecx, edx);
cmpoxadd(ptr[rax+r10*4], ecx, edx);
cmppxadd(ptr[rax+r10*4], ecx, edx);
cmpsxadd(ptr[rax+r10*4], ecx, edx);
cmpzxadd(ptr[rax+r10*4], ecx, edx);
// 64bit reg
cmpbexadd(ptr[rax+r10*4], rcx, rdx);
cmpbxadd(ptr[rax+r10*4], rcx, rdx);
cmplexadd(ptr[rax+r10*4], rcx, rdx);
cmplxadd(ptr[rax+r10*4], rcx, rdx);
cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
cmpoxadd(ptr[rax+r10*4], rcx, rdx);
cmppxadd(ptr[rax+r10*4], rcx, rdx);
cmpsxadd(ptr[rax+r10*4], rcx, rdx);
cmpzxadd(ptr[rax+r10*4], rcx, rdx);
}
} c;
// Expected bytes, six per instruction, one row per call above.
// Within a group only the fourth byte changes (0xe0..0xef, one value per
// condition variant); the 64-bit group differs from the 32-bit group only in
// the third byte (0xe9 instead of 0x69).
const uint8_t tbl[] = {
// 32bit reg
0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90,
// 64bit reg
0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90,
};
// number of expected bytes
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// the generated code must have the expected length ...
CYBOZU_TEST_EQUAL(c.getSize(), n);
// ... and the expected content
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for prefetchit0 / prefetchit1 on the memory operand [rax].
// The bytes produced by the generator are compared with the reference table.
CYBOZU_TEST_AUTO(prefetchiti)
{
	// Reference encoding, one row per instruction, in emission order.
	const uint8_t tbl[] = {
		0x0f, 0x18, 0x38, // prefetchit0(ptr[rax])
		0x0f, 0x18, 0x30, // prefetchit1(ptr[rax])
	};
	// tbl holds single bytes, so its size in bytes is also its element count.
	const size_t n = sizeof(tbl);

	// Generator that emits the two instructions from its constructor.
	struct Code : Xbyak::CodeGenerator {
		Code()
		{
			prefetchit0(ptr[rax]);
			prefetchit1(ptr[rax]);
		}
	};
	Code c;

	CYBOZU_TEST_EQUAL(c.getSize(), n);
	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif

View file

@ -56,7 +56,7 @@ void test2()
void test3() void test3()
{ {
static struct EmptyAllocator : Xbyak::Allocator { static struct EmptyAllocator : Xbyak::Allocator {
uint8_t *alloc() { return 0; } uint8_t *alloc(size_t) { return 0; }
} emptyAllocator; } emptyAllocator;
struct Code : CodeGenerator { struct Code : CodeGenerator {
Code() : CodeGenerator(8, 0, &emptyAllocator) Code() : CodeGenerator(8, 0, &emptyAllocator)

View file

@ -1,13 +1,17 @@
#!/bin/sh #!/bin/sh
set -e
FILTER="grep -v warning" FILTER="grep -v warning"
sub() sub()
{ {
CFLAGS="-Wall -fno-operator-names -I../ $OPT2" CFLAGS="-Wall -I../ $OPT2"
CXX=${CXX:=g++}
echo "compile address.cpp" echo "compile address.cpp"
g++ $CFLAGS address.cpp -o address $CXX $CFLAGS address.cpp -o address
./address $1 > a.asm ./address $1 > a.asm
echo "asm" echo "asm"
@ -17,7 +21,7 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./address $1 jit > nm.cpp ./address $1 jit > nm.cpp
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame > x.lst ./nm_frame > x.lst
diff ok.lst x.lst && echo "ok" diff ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh #!/bin/sh
set -e
FILTER="grep -v warning" FILTER="grep -v warning"
CXX=${CXX:=g++}
case $1 in case $1 in
Y) Y)
@ -31,9 +34,9 @@ Y64)
;; ;;
esac esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX" CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX"
echo "compile make_nm.cpp" echo "compile make_nm.cpp"
g++ $CFLAGS make_nm.cpp -o make_nm $CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm ./make_nm > a.asm
echo "asm" echo "asm"
@ -43,6 +46,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak" echo "xbyak"
./make_nm jit > nm.cpp ./make_nm jit > nm.cpp
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -B ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh #!/bin/sh
set -e
FILTER="grep -v warning" FILTER="grep -v warning"
CXX=${CXX:=g++}
case $1 in case $1 in
64) 64)
@ -18,9 +21,9 @@ case $1 in
;; ;;
esac esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512" CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512"
echo "compile make_512.cpp" echo "compile make_512.cpp"
g++ $CFLAGS make_512.cpp -o make_512 $CXX $CFLAGS make_512.cpp -o make_512
./make_512 > a.asm ./make_512 > a.asm
echo "asm" echo "asm"
@ -30,6 +33,6 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./make_512 jit > nm.cpp ./make_512 jit > nm.cpp
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -B ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh #!/bin/sh
set -e
FILTER=cat FILTER=cat
CXX=${CXX:=g++}
case $1 in case $1 in
Y) Y)
@ -44,9 +47,9 @@ noexcept)
;; ;;
esac esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2" CFLAGS="-Wall -I../ $OPT2"
echo "compile make_nm.cpp with $CFLAGS" echo "compile make_nm.cpp with $CFLAGS"
g++ $CFLAGS make_nm.cpp -o make_nm $CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm ./make_nm > a.asm
echo "asm" echo "asm"
@ -56,6 +59,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak" echo "xbyak"
./make_nm jit > nm.cpp ./make_nm jit > nm.cpp
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -B ok.lst x.lst && echo "ok"

View file

@ -118,7 +118,7 @@
#endif #endif
#endif #endif
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800) #if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
#undef XBYAK_TLS #undef XBYAK_TLS
#define XBYAK_TLS thread_local #define XBYAK_TLS thread_local
#define XBYAK_VARIADIC_TEMPLATE #define XBYAK_VARIADIC_TEMPLATE
@ -144,11 +144,18 @@
#pragma warning(disable : 4127) /* constant expression */ #pragma warning(disable : 4127) /* constant expression */
#endif #endif
// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
#if defined(__GNUC__) && !defined(__clang__)
#define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
namespace Xbyak { namespace Xbyak {
enum { enum {
DEFAULT_MAX_CODE_SIZE = 4096, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x6610 /* 0xABCD = A.BC(.D) */ VERSION = 0x6680 /* 0xABCD = A.BC(.D) */
}; };
#ifndef MIE_INTEGER_TYPE_DEFINED #ifndef MIE_INTEGER_TYPE_DEFINED
@ -371,7 +378,7 @@ inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0
inline uint32_t VerifyInInt32(uint64_t x) inline uint32_t VerifyInInt32(uint64_t x)
{ {
#ifdef XBYAK64 #if defined(XBYAK64) && !defined(__ILP32__)
if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0) if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
#endif #endif
return static_cast<uint32_t>(x); return static_cast<uint32_t>(x);
@ -1478,7 +1485,6 @@ public:
clabelDefList_.clear(); clabelDefList_.clear();
clabelUndefList_.clear(); clabelUndefList_.clear();
resetLabelPtrList(); resetLabelPtrList();
ClearError();
} }
void enterLocal() void enterLocal()
{ {
@ -1820,7 +1826,7 @@ private:
void setSIB(const RegExp& e, int reg, int disp8N = 0) void setSIB(const RegExp& e, int reg, int disp8N = 0)
{ {
uint64_t disp64 = e.getDisp(); uint64_t disp64 = e.getDisp();
#ifdef XBYAK64 #if defined(XBYAK64) && !defined(__ILP32__)
#ifdef XBYAK_OLD_DISP_CHECK #ifdef XBYAK_OLD_DISP_CHECK
// treat 0xffffffff as 0xffffffffffffffff // treat 0xffffffff as 0xffffffffffffffff
uint64_t high = disp64 >> 32; uint64_t high = disp64 >> 32;
@ -2412,18 +2418,21 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code); opVex(x, 0, addr, type, code);
} }
void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
{ {
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
}
int orEvexIf(PreferredEncoding encoding) {
if (encoding == DefaultEncoding) { if (encoding == DefaultEncoding) {
encoding = EvexEncoding; encoding = defaultEncoding_;
} }
if (encoding == EvexEncoding) { if (encoding == EvexEncoding) {
#ifdef XBYAK_DISABLE_AVX512 #ifdef XBYAK_DISABLE_AVX512
XBYAK_THROW(ERR_EVEX_IS_INVALID) XBYAK_THROW(ERR_EVEX_IS_INVALID)
#endif #endif
type |= T_MUST_EVEX; return T_MUST_EVEX;
} }
opAVX_X_X_XM(x1, x2, op, type, code0); return 0;
} }
void opInOut(const Reg& a, const Reg& d, uint8_t code) void opInOut(const Reg& a, const Reg& d, uint8_t code)
{ {
@ -2508,6 +2517,7 @@ public:
#endif #endif
private: private:
bool isDefaultJmpNEAR_; bool isDefaultJmpNEAR_;
PreferredEncoding defaultEncoding_;
public: public:
void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); }
@ -2787,11 +2797,13 @@ public:
, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
#endif #endif
, isDefaultJmpNEAR_(false) , isDefaultJmpNEAR_(false)
, defaultEncoding_(EvexEncoding)
{ {
labelMgr_.set(this); labelMgr_.set(this);
} }
void reset() void reset()
{ {
ClearError();
resetSize(); resetSize();
labelMgr_.reset(); labelMgr_.reset();
labelMgr_.set(this); labelMgr_.set(this);
@ -2823,6 +2835,9 @@ public:
#undef jnl #undef jnl
#endif #endif
// set default encoding to select Vex or Evex
void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
/* /*
use single byte nop if useMultiByteNop = false use single byte nop if useMultiByteNop = false
*/ */
@ -2927,6 +2942,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen
#pragma warning(pop) #pragma warning(pop)
#endif #endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
} // end of namespace } // end of namespace
#endif // XBYAK_XBYAK_H_ #endif // XBYAK_XBYAK_H_

View file

@ -1,4 +1,6 @@
const char *getVersionString() const { return "6.61"; } const char *getVersionString() const { return "6.68"; }
void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM
void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); } void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); } void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); } void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
void aor(const Address& addr, const Reg32e &reg) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void axor(const Address& addr, const Reg32e &reg) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); } void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); } void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); } void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
@ -654,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); } void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
void popf() { db(0x9D); } void popf() { db(0x9D); }
void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); } void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); } void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); } void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); } void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
@ -747,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
void scasb() { db(0xAE); } void scasb() { db(0xAE); }
void scasd() { db(0xAF); } void scasd() { db(0xAF); }
void scasw() { db(0x66); db(0xAF); } void scasw() { db(0x66); db(0xAF); }
void serialize() { db(0x0F); db(0x01); db(0xE8); }
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524
@ -844,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); }
void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); }
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); }
@ -988,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T
void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); } void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); } void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); }
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); } void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); }
void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }
void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }
void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }
void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }
@ -1191,10 +1205,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
@ -1226,6 +1246,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); }
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); }
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); }
// vpmadd52huq with a caller-selectable encoding: forwards x1, x2 and op to opEncoding with
// flags T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, opcode byte 0xB5, and `encoding` (DefaultEncoding when omitted).
// NOTE(review): how opEncoding picks the final encoding is not visible here - confirm in its definition.
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); }
// vpmadd52luq with a caller-selectable encoding: forwards x1, x2 and op to opEncoding with
// flags T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, opcode byte 0xB4, and `encoding` (DefaultEncoding when omitted).
// NOTE(review): how opEncoding picks the final encoding is not visible here - confirm in its definition.
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); }
void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); }
void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); }
void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); }
@ -1642,6 +1664,22 @@ void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx())
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); } void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); }
void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); } void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }
void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); } void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }
// cmpbexadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE6 and a trailing `false`.
// NOTE(review): presumably the "be" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); }
// cmpbxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE2 and a trailing `false`.
// NOTE(review): presumably the "b" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); }
// cmplexadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xEE and a trailing `false`.
// NOTE(review): presumably the "le" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); }
// cmplxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xEC and a trailing `false`.
// NOTE(review): presumably the "l" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); }
// cmpnbexadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE7 and a trailing `false`.
// NOTE(review): presumably the "nbe" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); }
// cmpnbxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE3 and a trailing `false`.
// NOTE(review): presumably the "nb" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); }
// cmpnlexadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xEF and a trailing `false`.
// NOTE(review): presumably the "nle" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); }
// cmpnlxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xED and a trailing `false`.
// NOTE(review): presumably the "nl" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); }
// cmpnoxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE1 and a trailing `false`.
// NOTE(review): presumably the "no" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); }
// cmpnpxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xEB and a trailing `false`.
// NOTE(review): presumably the "np" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); }
// cmpnsxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE9 and a trailing `false`.
// NOTE(review): presumably the "ns" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); }
// cmpnzxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE5 and a trailing `false`.
// NOTE(review): presumably the "nz" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); }
// cmpoxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE0 and a trailing `false`.
// NOTE(review): presumably the "o" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); }
// cmppxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xEA and a trailing `false`.
// NOTE(review): presumably the "p" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); }
// cmpsxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE8 and a trailing `false`.
// NOTE(review): presumably the "s" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); }
// cmpzxadd: forwards to opGpr with operands ordered (r1, addr, r2), flags T_66 | T_0F38, opcode byte 0xE4 and a trailing `false`.
// NOTE(review): presumably the "z" condition of the CMPccXADD family (name-based) - confirm against the Intel reference.
void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); }
void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
@ -1653,6 +1691,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T
void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
// tdpfp16ps: forwards to opVex with flags T_F2 | T_0F38 | T_W0 and opcode byte 0x5c.
// Note the argument order: x1 first, then the address of x3, then x2 (x2 and x3 are swapped relative to the parameter list).
void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
#else #else
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
@ -1907,7 +1946,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
// vcvtneps2bf16: forwards x and op to opCvt2 with flags
// T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32 and opcode byte 0x72.
// NOTE(review): operand validation, if any, happens inside opCvt2 and is not visible here.
void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@ -2141,38 +2179,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T
void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); }
void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); }
void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); }
// vpmadd52huq (three-operand form without an encoding parameter): forwards x1, x2 and op to opAVX_X_X_XM
// with flags T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64 and opcode byte 0xB5.
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); }
// vpmadd52luq (three-operand form without an encoding parameter): forwards x1, x2 and op to opAVX_X_X_XM
// with flags T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64 and opcode byte 0xB4.
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); }
void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); }
void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); }
void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); }
void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); } void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); }
void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); } void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); } void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); } void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); }
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); } void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); }
void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); } void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); } void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); } void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); } void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); } void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); } void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); }
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); } void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); }
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); } void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); }
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); } void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); }
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); } void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); }
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); } void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); }
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); } void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); }
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); } void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); }
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); } void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); }
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); } void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); }
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); } void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); }
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); } void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); }
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); } void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); }
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); } void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); }
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); } void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); }
void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); } void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); } void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); }
void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); } void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); } void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); }
void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); }

View file

@ -4,7 +4,6 @@
#ifdef XBYAK_ONLY_CLASS_CPU #ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <algorithm>
#include <assert.h> #include <assert.h>
#ifndef XBYAK_THROW #ifndef XBYAK_THROW
#define XBYAK_THROW(x) ; #define XBYAK_THROW(x) ;
@ -96,6 +95,11 @@ struct TypeT {
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2> template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); } TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
/*
	Return the larger of x and y, taking both by value.
	When x >= y holds (including equality) x is returned, otherwise y.
*/
template<typename T>
inline T max_(T x, T y)
{
	if (x >= y) return x;
	return y;
}
/*
	Return the smaller of x and y, taking both by value.
	x is returned only when x < y holds; on equality y is returned.
*/
template<typename T>
inline T min_(T x, T y)
{
	if (x < y) return x;
	return y;
}
} // local } // local
/** /**
@ -193,8 +197,8 @@ private:
/* /*
Fallback values in case a hypervisor has 0xB leaf zeroed-out. Fallback values in case a hypervisor has 0xB leaf zeroed-out.
*/ */
numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]); numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
} else { } else {
/* /*
Failed to determine num of cores without x2APIC support. Failed to determine num of cores without x2APIC support.
@ -237,7 +241,7 @@ private:
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1; uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
} }
assert(actual_logical_cores != 0); assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] = dataCacheSize_[dataCacheLevels_] =
@ -247,7 +251,7 @@ private:
* (data[2] + 1); * (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0); assert(smt_width != 0);
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++; dataCacheLevels_++;
} }
} }
@ -302,7 +306,7 @@ public:
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{ {
#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER #ifdef _WIN32
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else #else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
@ -406,6 +410,13 @@ public:
XBYAK_DEFINE_TYPE(65, tMOVDIRI); XBYAK_DEFINE_TYPE(65, tMOVDIRI);
XBYAK_DEFINE_TYPE(66, tMOVDIR64B); XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
XBYAK_DEFINE_TYPE(68, tAMX_FP16);
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
#undef XBYAK_SPLIT_ID #undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE #undef XBYAK_DEFINE_TYPE
@ -545,10 +556,17 @@ public:
if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) { if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data); getCpuidEx(7, 1, data);
if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) { if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16; if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
} }
if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
if (EAX & (1U << 21)) type_ |= tAMX_FP16;
if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
} }
} }
setFamily(); setFamily();
@ -771,7 +789,7 @@ public:
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM) if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
const Reg64& _rsp = code->rsp; const Reg64& _rsp = code->rsp;
saveNum_ = (std::max)(0, allRegNum - noSaveNum); saveNum_ = local::max_(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum; const int *tbl = getOrderTbl() + noSaveNum;
for (int i = 0; i < saveNum_; i++) { for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i])); code->push(Reg64(tbl[i]));