externals: Update xbyak to v6.68

Merge commit 'f6fdb5f55a88f73ef7bef45f50cf5878ceec9781'
This commit is contained in:
Merry 2022-12-30 23:05:02 +00:00
commit 916d7cf9bd
30 changed files with 589 additions and 155 deletions

View file

@ -1,13 +1,21 @@
name: test
on: [push]
defaults:
run:
shell: sh
permissions:
contents: read
jobs:
build:
name: test
test:
runs-on: ubuntu-latest
container:
image: debian:testing
steps:
- uses: actions/checkout@v2
- run: sudo apt update
- run: sudo apt install nasm yasm g++-multilib tcsh
- uses: actions/checkout@v3
- run: apt -y update
- run: apt -y install g++-multilib libboost-dev make nasm yasm
- run: make test
- run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION"

8
externals/xbyak/Android.bp vendored Normal file
View file

@ -0,0 +1,8 @@
//#################################################
cc_library_headers {
name: "xbyak_headers",
vendor: true,
export_include_dirs: [
"xbyak"
],
}

View file

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 2.6...3.0.2)
project(xbyak LANGUAGES CXX VERSION 6.61)
project(xbyak LANGUAGES CXX VERSION 6.68)
file(GLOB headers xbyak/*.h)

View file

@ -1,5 +1,14 @@
# History
* 2022/Dec/07 ver 6.68 support prefetchit{0,1}
* 2022/Nov/30 ver 6.67 support CMPccXADD
* 2022/Nov/25 ver 6.66 support RAO-INT
* 2022/Nov/22 ver 6.65 consider x32
* 2022/Nov/04 ver 6.64 some vmov* support addressing with mask
* 2022/Oct/06 ver 6.63 vpmadd52{h,l}uq support AVX-IFMA
* 2022/Oct/05 ver 6.63 support amx_fp16/avx_vnni_int8/avx_ne_convert and add setDefaultEncoding()
* 2022/Aug/15 ver 6.62 add serialize instruction
* 2022/Aug/02 ver 6.61.1 noexcept is supported by Visual Studio 2015 or later
* 2022/Jul/29 ver 6.61 fix exception of movzx eax, ah in 64-bit mode
* 2022/Jun/16 ver 6.60.2 fix detection of GFNI, VAES, and VPCLMULQDQ
* 2022/Jun/15 ver 6.60.1 fix link error of Xbyak::util::Cpu on Visual Studio with /O0 option

View file

@ -12,3 +12,15 @@ make install
```
These files are copied into `/usr/local/include/xbyak`.
# Building xbyak - Using vcpkg
You can download and install xbyak using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh
./vcpkg integrate install
./vcpkg install xbyak
The xbyak port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.

View file

@ -110,7 +110,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
setDefaultEncoding(VexEncoding); // default encoding is VEX
vpdpbusd(xm0, xm1, xm2); // VEX encoding
```
- setDefaultEncoding(PreferredEncoding encoding);
- Set the default encoding to select EVEX or VEX.
- The default value is EvexEncoding.
- This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd.
### Remark
* `k1`, ..., `k7` are opmask registers.
- `k0` is dealt as no mask.

View file

@ -1,6 +1,6 @@
TARGET=../xbyak/xbyak_mnemonic.h
BIN=sortline gen_code gen_avx512
CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt
sortline: sortline.cpp
$(CXX) $(CFLAGS) $< -o $@

View file

@ -387,9 +387,6 @@ void putX_X_XM_IMM()
{ 0x57, "vreducess", T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N4, true },
{ 0x57, "vreducesh", T_0F3A | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, true },
{ 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
{ 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
{ 0x70, "vpshldw", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z, true },
{ 0x71, "vpshldd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, true },
{ 0x71, "vpshldq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, true },
@ -695,29 +692,29 @@ void putMov()
int type;
int mode;
} tbl[] = {
{ 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
{ 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
{ 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL, false },
{ 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x22, "vpmovsqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x12, "vpmovusqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
{ 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x34, "vpmovqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x24, "vpmovsqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x14, "vpmovusqw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x35, "vpmovqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x25, "vpmovsqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x15, "vpmovusqd", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL, false },
{ 0x31, "vpmovdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x21, "vpmovsdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x11, "vpmovusdb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 | T_N_VL | T_M_K, false },
{ 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x33, "vpmovdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x23, "vpmovsdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x13, "vpmovusdw", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL, true },
{ 0x30, "vpmovwb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x20, "vpmovswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
{ 0x10, "vpmovuswb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N8 | T_N_VL | T_M_K, true },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
@ -827,7 +824,6 @@ void putMisc()
puts("void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm); }");
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
puts("void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }");
puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }");

View file

@ -560,6 +560,8 @@ void put()
{ 0, "nta", 0x18},
{ 2, "wt1", 0x0D},
{ 1, "w", 0x0D},
{ 7, "it0", 0x18},
{ 6, "it1", 0x18},
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@ -693,6 +695,7 @@ void put()
{ "lock", 0xF0 },
{ "sahf", 0x9E },
{ "serialize", 0x0F, 0x01, 0xE8 },
{ "stc", 0xF9 },
{ "std", 0xFD },
{ "sti", 0xFB },
@ -806,6 +809,23 @@ void put()
printf("void %s(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x%02X, %d); }\n", p->name, p->code, p->ext);
}
}
{
const struct Tbl {
const char *name;
uint8_t prefix;
} tbl[] = {
{ "aadd", 0 },
{ "aand", 0x66 },
{ "aor", 0xF2 },
{ "axor", 0xF3 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
printf("void %s(const Address& addr, const Reg32e &reg) { ", p->name);
if (p->prefix) printf("db(0x%02X); ", p->prefix);
printf("opModM(addr, reg, 0x0F, 0x38, 0x0FC); }\n");
}
}
{
const struct Tbl {
@ -1666,6 +1686,25 @@ void put()
puts("void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) { checkCvt1(x, op); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm); }");
}
{
const struct Tbl {
const char *name;
int type;
uint8_t code;
} tbl[] = {
{ "vbcstnebf162ps", T_F3 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
{ "vbcstnesh2ps", T_66 | T_0F38 | T_W0 | T_B16 | T_YMM, 0xB1 },
{ "vcvtneebf162ps", T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneeph2ps", T_66 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneobf162ps", T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0 },
{ "vcvtneoph2ps", T_0F38 | T_W0 | T_YMM, 0xB0 }
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code);
}
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }");
}
// haswell gpr(reg, reg, r/m)
{
const struct Tbl {
@ -1755,11 +1794,33 @@ void put()
{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
{ 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
{ 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, type.c_str(), p->code);
}
}
// avx-vnni-int8
{
const struct Tbl {
uint8_t code;
const char *name;
int type;
} tbl[] = {
{ 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
}
}
}
@ -1824,6 +1885,34 @@ void put64()
puts("void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }");
puts("void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }");
// CMPccXADD
{
const struct Tbl {
const char *name;
uint8_t code;
} tbl[] = {
{ "be", 0xE6 },
{ "b", 0xE2 },
{ "le", 0xEE },
{ "l", 0xEC },
{ "nbe", 0xE7 },
{ "nb", 0xE3 },
{ "nle", 0xEF },
{ "nl", 0xED },
{ "no", 0xE1 },
{ "np", 0xEB },
{ "ns", 0xE9 },
{ "nz", 0xE5 },
{ "o", 0xE0 },
{ "p", 0xEA },
{ "s", 0xE8 },
{ "z", 0xE4 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0x%02X, false); }\n", p->name, p->code);
}
}
}
void putAMX_TILE()
@ -1842,6 +1931,7 @@ void putAMX_INT8()
puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }");
puts("void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }");
}
void putAMX_BF16()
{

View file

@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
version: '6.61',
version: '6.68',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)

View file

@ -1,5 +1,5 @@
# Xbyak 6.61 [![Badge Build]][Build Status]
# Xbyak 6.68 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@ -28,6 +28,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### News
- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
- add movdiri, movdir64b, clwb, cldemote
- WAITPKG instructions (tpause, umonitor, umwait) are supported.
- MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.61
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
-----------------------------------------------------------------------------
◎概要
@ -166,13 +166,15 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64],
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
setDefaultEncoding(VexEncoding); // default encoding is VEX
vpdpbusd(xm0, xm1, xm2); // VEX encoding
注意
* k1, ..., k7 は新しいopmaskレジスタです。
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
* `k4 | k3`と`k3 | k4`は意味が異なります。
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
* setDefaultEncoding()でencoding省略時のEVEX/VEXを設定できます。
・ラベル
@ -400,6 +402,15 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート
2022/11/22 ver 6.65 x32動作確認
2022/11/04 ver 6.64 vmov*命令をmaskつきアドレッシング対応修正
2022/10/06 ver 6.63 AVX-IFMA用のvpmadd52{h,l}uq対応
2022/10/05 amx_fp16/avx_vnni_int8/avx_ne_convertt対応とsetDefaultEncoding()追加
2022/09/15 ver 6.62 serialize追加
2022/08/02 ver 6.61.1 noexceptはVisual Studio 2015以降対応
2022/07/29 ver 6.61 movzx eax, ahがエラーになるのを修正
2022/06/16 ver 6.60.2 GFNI, VAES, VPCLMULQDQの判定修正
2022/06/15 ver 6.60.1 Visual Studio /O0でXbyak::util::Cpuがリンクエラーになるのに対応

View file

@ -1,6 +1,7 @@
XBYAK_INC=../xbyak/xbyak.h
CXX?=g++
BOOST_EXIST=$(shell echo "\#include <boost/spirit/core.hpp>" | (gcc -E - 2>/dev/null) | grep "boost/spirit/core.hpp" >/dev/null && echo "1")
BOOST_EXIST=$(shell echo "#include <boost/spirit/core.hpp>" | $(CXX) -x c++ -c - 2>/dev/null && echo 1)
UNAME_M=$(shell uname -m)
ONLY_64BIT=0
@ -104,7 +105,7 @@ profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
clean:
rm -rf *.o $(TARGET) *.exe profiler profiler-vtune
rm -rf $(TARGET) profiler profiler-vtune
test : test0.cpp $(XBYAK_INC)
test64: test0.cpp $(XBYAK_INC)

View file

@ -199,7 +199,7 @@ int main(int argc, char *argv[])
quantize2(dest2, src, qTbl);
for (int i = 0; i < N; i++) {
if (dest[i] != dest2[i]) {
printf("err[%d] %d %d\n", i, dest[i], dest2[i]);
printf("err[%d] %u %u\n", i, dest[i], dest2[i]);
}
}

View file

@ -89,6 +89,13 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" },
{ Cpu::tCLZERO, "clzero" },
{ Cpu::tAMX_FP16, "amx_fp16" },
{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
{ Cpu::tAVX_NE_CONVERT, "avx_ne_convert" },
{ Cpu::tAVX_IFMA, "avx_ifma" },
{ Cpu::tRAO_INT, "rao-int" },
{ Cpu::tCMPCCXADD, "cmpccxadd" },
{ Cpu::tPREFETCHITI, "prefetchiti" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);

View file

@ -5,8 +5,8 @@
mem_ 4byte x 65536
4byte固定
16bit
all instructions are fixed at 4 bytes.
all immediate values are 16-bit.
R = A or B
vldiR, imm ; R = imm
@ -109,7 +109,7 @@ public:
reg[r] -= imm;
break;
case PUT:
printf("%c %8d(0x%08x)\n", 'A' + r, reg[r], reg[r]);
printf("%c %8u(0x%08x)\n", 'A' + r, reg[r], reg[r]);
break;
case JNZ:
if (reg[r] != 0) pc += static_cast<signed short>(imm);
@ -294,7 +294,7 @@ lp:
p = t;
n--;
if (n != 0) goto lp;
printf("c=%d(0x%08x)\n", c, c);
printf("c=%u(0x%08x)\n", c, c);
}
int main()

View file

@ -1,6 +1,9 @@
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
XBYAK_INC=../xbyak/xbyak.h
UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32)
X32?=1
endif
BIT=32
ifeq ($(shell uname -m),x86_64)
BIT=64
@ -20,9 +23,9 @@ endif
all: $(TARGET)
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS=-O2 -fomit-frame-pointer -Wall -fno-operator-names -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
make_nm:
$(CXX) $(CFLAGS) make_nm.cpp -o $@
normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h
@ -53,12 +56,11 @@ noexception: noexception.cpp ../xbyak/xbyak.h
test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen
ifneq ($(ONLY_64BIT),1)
./test_nm.sh
./test_nm.sh noexcept
./noexception
./test_nm.sh Y
./test_nm.sh avx512
./test_address.sh
CXX=$(CXX) ./test_nm.sh
CXX=$(CXX) ./test_nm.sh noexcept
CXX=$(CXX) ./test_nm.sh Y
CXX=$(CXX) ./test_nm.sh avx512
CXX=$(CXX) ./test_address.sh
./jmp
./cvt_test32
endif
@ -67,32 +69,38 @@ endif
./misc32
./cvt_test
ifeq ($(BIT),64)
./test_address.sh 64
./test_nm.sh 64
./test_nm.sh Y64
CXX=$(CXX) ./test_address.sh 64
ifneq ($(X32),1)
CXX=$(CXX) ./test_nm.sh 64
CXX=$(CXX) ./test_nm.sh Y64
endif
./jmp64
endif
test_avx: normalize_prefix
ifneq ($(ONLY_64BIT),0)
./test_avx.sh
./test_avx.sh Y
CXX=$(CXX) ./test_avx.sh
CXX=$(CXX) ./test_avx.sh Y
endif
ifeq ($(BIT),64)
./test_address.sh 64
./test_avx.sh 64
./test_avx.sh Y64
CXX=$(CXX) ./test_avx.sh 64
ifneq ($(X32),1)
CXX=$(CXX) ./test_avx.sh Y64
endif
endif
test_avx512: normalize_prefix
ifneq ($(ONLY_64BIT),0)
./test_avx512.sh
CXX=$(CXX) ./test_avx512.sh
endif
ifeq ($(BIT),64)
./test_avx512.sh 64
CXX=$(CXX) ./test_avx512.sh 64
endif
test:
detect_x32: detect_x32.c
$(CC) $< -o $@
test: detect_x32
$(MAKE) test_nm
$(MAKE) test_avx
$(MAKE) test_avx512
@ -104,4 +112,3 @@ lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
make_nm: make_nm.cpp $(XBYAK_INC)

View file

@ -1,4 +1,4 @@
OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS
OPT=/EHsc -I../xbyak /W4 -D_CRT_SECURE_NO_WARNINGS -I ../
../xbyak/xbyak_mnemonic.h: ../gen/gen_code.exe ../gen/gen_avx512.exe
../gen/gen_code.exe > $@
../gen/gen_avx512.exe >> $@

8
externals/xbyak/test/detect_x32.c vendored Normal file
View file

@ -0,0 +1,8 @@
#include <stdio.h>
int main()
{
#if defined(__x86_64__) && defined(__ILP32__)
puts("x32");
#endif
}

View file

@ -1807,44 +1807,44 @@ public:
put("vpmovd2m", K, _XMM | _YMM | _ZMM);
put("vpmovq2m", K, _XMM | _YMM | _ZMM);
put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovqd", YMM_KZ | _MEM, _ZMM);
put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovsqd", YMM_KZ | _MEM, _ZMM);
put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovusqd", YMM_KZ | _MEM, _ZMM);
put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovdw", YMM_KZ | _MEM, _ZMM);
put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovsdw", YMM_KZ | _MEM, _ZMM);
put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovusdw", YMM_KZ | _MEM, _ZMM);
put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovwb", YMM_KZ | _MEM, _ZMM);
put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovswb", YMM_KZ | _MEM, _ZMM);
put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM);
put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM);
put("vpmovuswb", YMM_KZ | _MEM, _ZMM);
put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM);
}
void putRot()
{

View file

@ -533,6 +533,7 @@ class Test {
"nop",
"sahf",
"serialize",
"stc",
"std",
"sti",
@ -1017,9 +1018,7 @@ class Test {
}
void putCmov() const
{
const struct {
const char *s;
} tbl[] = {
const char tbl[][4] = {
"o",
"no",
"b",
@ -1053,11 +1052,11 @@ class Test {
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
char buf[32];
snprintf(buf, sizeof(buf), "cmov%s", tbl[i].s);
snprintf(buf, sizeof(buf), "cmov%s", tbl[i]);
put(buf, REG16, REG16|MEM);
put(buf, REG32, REG32|MEM);
put(buf, REG64, REG64|MEM);
snprintf(buf, sizeof(buf), "set%s", tbl[i].s);
snprintf(buf, sizeof(buf), "set%s", tbl[i]);
put(buf, REG8|REG8_3|MEM);
}
}
@ -1294,7 +1293,7 @@ class Test {
put(p, REG64, "0x1234567890abcdefLL", "0x1234567890abcdef");
put("movbe", REG16|REG32e, MEM);
put("movbe", MEM, REG16|REG32e);
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
put(p, RAX|EAX|AX|AL, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]");
put(p, "ptr [0x1234567890abcdefLL]", "[qword 0x1234567890abcdef]", RAX|EAX|AX|AL);
put(p, "qword [rax], 0");
@ -2608,7 +2607,7 @@ public:
putMPX();
#endif
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
#ifdef USE_YASM
putRip();

View file

@ -5,6 +5,7 @@
#include <xbyak/xbyak_util.h>
#include <cybozu/inttype.hpp>
#include <cybozu/test.hpp>
#include <algorithm>
using namespace Xbyak;
@ -97,13 +98,17 @@ CYBOZU_TEST_AUTO(mov_const)
}
#ifdef XBYAK64
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff]));
if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
}
#ifdef XBYAK_OLD_DISP_CHECK
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
#else
if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
}
#endif
#endif
}
@ -875,6 +880,10 @@ CYBOZU_TEST_AUTO(vnni)
vpdpbusd(xm0, xm1, xm2);
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX
setDefaultEncoding(VexEncoding);
vpdpbusd(xm0, xm1, xm2); // VEX
setDefaultEncoding(EvexEncoding);
vpdpbusd(xm0, xm1, xm2); // EVEX
}
void badVex()
{
@ -885,6 +894,8 @@ CYBOZU_TEST_AUTO(vnni)
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
@ -1975,3 +1986,175 @@ CYBOZU_TEST_AUTO(cpu)
Cpu cpu;
CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD));
}
CYBOZU_TEST_AUTO(minmax)
{
using namespace Xbyak::util;
CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4));
CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4));
}
CYBOZU_TEST_AUTO(rao_int)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
#ifdef XBYAK64
aadd(ptr[rax], ecx);
aadd(ptr[eax], ecx);
aadd(ptr[rax], r10);
aand(ptr[rax], ecx);
aand(ptr[eax], ecx);
aand(ptr[rax], r10);
aor(ptr[rax], ecx);
aor(ptr[eax], ecx);
aor(ptr[rax], r10);
axor(ptr[rax], ecx);
axor(ptr[eax], ecx);
axor(ptr[rax], r10);
#else
aadd(ptr[eax], ecx);
aand(ptr[eax], ecx);
aor(ptr[eax], ecx);
axor(ptr[eax], ecx);
#endif
}
} c;
const uint8_t tbl[] = {
#ifdef XBYAK64
// aadd
0x0f, 0x38, 0xfc, 0x08,
0x67, 0x0f, 0x38, 0xfc, 0x08,
0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
#else
// aadd
0x0f, 0x38, 0xfc, 0x08,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
#endif
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#ifdef XBYAK64
CYBOZU_TEST_AUTO(CMPccXADD)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// 32bit reg
cmpbexadd(ptr[rax+r10*4], ecx, edx);
cmpbxadd(ptr[rax+r10*4], ecx, edx);
cmplexadd(ptr[rax+r10*4], ecx, edx);
cmplxadd(ptr[rax+r10*4], ecx, edx);
cmpnbexadd(ptr[rax+r10*4], ecx, edx);
cmpnbxadd(ptr[rax+r10*4], ecx, edx);
cmpnlexadd(ptr[rax+r10*4], ecx, edx);
cmpnlxadd(ptr[rax+r10*4], ecx, edx);
cmpnoxadd(ptr[rax+r10*4], ecx, edx);
cmpnpxadd(ptr[rax+r10*4], ecx, edx);
cmpnsxadd(ptr[rax+r10*4], ecx, edx);
cmpnzxadd(ptr[rax+r10*4], ecx, edx);
cmpoxadd(ptr[rax+r10*4], ecx, edx);
cmppxadd(ptr[rax+r10*4], ecx, edx);
cmpsxadd(ptr[rax+r10*4], ecx, edx);
cmpzxadd(ptr[rax+r10*4], ecx, edx);
// 64bit reg
cmpbexadd(ptr[rax+r10*4], rcx, rdx);
cmpbxadd(ptr[rax+r10*4], rcx, rdx);
cmplexadd(ptr[rax+r10*4], rcx, rdx);
cmplxadd(ptr[rax+r10*4], rcx, rdx);
cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
cmpoxadd(ptr[rax+r10*4], rcx, rdx);
cmppxadd(ptr[rax+r10*4], rcx, rdx);
cmpsxadd(ptr[rax+r10*4], rcx, rdx);
cmpzxadd(ptr[rax+r10*4], rcx, rdx);
}
} c;
const uint8_t tbl[] = {
// 32bit reg
0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90,
// 64bit reg
0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(prefetchiti)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
prefetchit0(ptr[rax]);
prefetchit1(ptr[rax]);
}
} c;
const uint8_t tbl[] = {
0x0f, 0x18, 0x38,
0x0f, 0x18, 0x30
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif

View file

@ -56,7 +56,7 @@ void test2()
void test3()
{
static struct EmptyAllocator : Xbyak::Allocator {
uint8_t *alloc() { return 0; }
uint8_t *alloc(size_t) { return 0; }
} emptyAllocator;
struct Code : CodeGenerator {
Code() : CodeGenerator(8, 0, &emptyAllocator)

View file

@ -1,13 +1,17 @@
#!/bin/sh
set -e
FILTER="grep -v warning"
sub()
{
CFLAGS="-Wall -fno-operator-names -I../ $OPT2"
CFLAGS="-Wall -I../ $OPT2"
CXX=${CXX:=g++}
echo "compile address.cpp"
g++ $CFLAGS address.cpp -o address
$CXX $CFLAGS address.cpp -o address
./address $1 > a.asm
echo "asm"
@ -17,7 +21,7 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak"
./address $1 jit > nm.cpp
echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame > x.lst
diff ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh
set -e
FILTER="grep -v warning"
CXX=${CXX:=g++}
case $1 in
Y)
@ -31,9 +34,9 @@ Y64)
;;
esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX"
CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX"
echo "compile make_nm.cpp"
g++ $CFLAGS make_nm.cpp -o make_nm
$CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm
echo "asm"
@ -43,6 +46,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak"
./make_nm jit > nm.cpp
echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh
set -e
FILTER="grep -v warning"
CXX=${CXX:=g++}
case $1 in
64)
@ -18,9 +21,9 @@ case $1 in
;;
esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512"
CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512"
echo "compile make_512.cpp"
g++ $CFLAGS make_512.cpp -o make_512
$CXX $CFLAGS make_512.cpp -o make_512
./make_512 > a.asm
echo "asm"
@ -30,6 +33,6 @@ awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak"
./make_512 jit > nm.cpp
echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"

View file

@ -1,6 +1,9 @@
#!/bin/sh
set -e
FILTER=cat
CXX=${CXX:=g++}
case $1 in
Y)
@ -44,9 +47,9 @@ noexcept)
;;
esac
CFLAGS="-Wall -fno-operator-names -I../ $OPT2"
CFLAGS="-Wall -I../ $OPT2"
echo "compile make_nm.cpp with $CFLAGS"
g++ $CFLAGS make_nm.cpp -o make_nm
$CXX $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm
echo "asm"
@ -56,6 +59,6 @@ awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER
echo "xbyak"
./make_nm jit > nm.cpp
echo "compile nm_frame.cpp"
g++ $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"

View file

@ -118,7 +118,7 @@
#endif
#endif
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800)
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
#undef XBYAK_TLS
#define XBYAK_TLS thread_local
#define XBYAK_VARIADIC_TEMPLATE
@ -144,11 +144,18 @@
#pragma warning(disable : 4127) /* constant expresison */
#endif
// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
#if defined(__GNUC__) && !defined(__clang__)
#define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x6610 /* 0xABCD = A.BC(.D) */
VERSION = 0x6680 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -371,7 +378,7 @@ inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0
inline uint32_t VerifyInInt32(uint64_t x)
{
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
#endif
return static_cast<uint32_t>(x);
@ -1478,7 +1485,6 @@ public:
clabelDefList_.clear();
clabelUndefList_.clear();
resetLabelPtrList();
ClearError();
}
void enterLocal()
{
@ -1820,7 +1826,7 @@ private:
void setSIB(const RegExp& e, int reg, int disp8N = 0)
{
uint64_t disp64 = e.getDisp();
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
#ifdef XBYAK_OLD_DISP_CHECK
// treat 0xffffffff as 0xffffffffffffffff
uint64_t high = disp64 >> 32;
@ -2412,18 +2418,21 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code);
}
void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
{
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
}
int orEvexIf(PreferredEncoding encoding) {
if (encoding == DefaultEncoding) {
encoding = EvexEncoding;
encoding = defaultEncoding_;
}
if (encoding == EvexEncoding) {
#ifdef XBYAK_DISABLE_AVX512
XBYAK_THROW(ERR_EVEX_IS_INVALID)
#endif
type |= T_MUST_EVEX;
return T_MUST_EVEX;
}
opAVX_X_X_XM(x1, x2, op, type, code0);
return 0;
}
void opInOut(const Reg& a, const Reg& d, uint8_t code)
{
@ -2508,6 +2517,7 @@ public:
#endif
private:
bool isDefaultJmpNEAR_;
PreferredEncoding defaultEncoding_;
public:
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); }
@ -2787,11 +2797,13 @@ public:
, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
#endif
, isDefaultJmpNEAR_(false)
, defaultEncoding_(EvexEncoding)
{
labelMgr_.set(this);
}
void reset()
{
ClearError();
resetSize();
labelMgr_.reset();
labelMgr_.set(this);
@ -2823,6 +2835,9 @@ public:
#undef jnl
#endif
// set default encoding to select Vex or Evex
void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
/*
use single byte nop if useMultiByteNop = false
*/
@ -2927,6 +2942,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen
#pragma warning(pop)
#endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
} // end of namespace
#endif // XBYAK_XBYAK_H_

View file

@ -1,4 +1,6 @@
const char *getVersionString() const { return "6.61"; }
const char *getVersionString() const { return "6.68"; }
void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM
void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
void aor(const Address& addr, const Reg32e &reg) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void axor(const Address& addr, const Reg32e &reg) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
@ -654,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
void popf() { db(0x9D); }
void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
@ -747,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
void scasb() { db(0xAE); }
void scasd() { db(0xAF); }
void scasw() { db(0x66); db(0xAF); }
void serialize() { db(0x0F); db(0x01); db(0xE8); }
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524
@ -844,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); }
void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); }
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); }
@ -988,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T
void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); }
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); }
void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }
void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }
void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }
void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }
@ -1191,10 +1205,16 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
@ -1226,6 +1246,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); }
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); }
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); }
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); }
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); }
void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); }
void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); }
void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); }
@ -1642,6 +1664,22 @@ void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx())
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); }
void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }
void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }
void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); }
void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); }
void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); }
void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); }
void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); }
void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); }
void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); }
void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); }
void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); }
void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); }
void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); }
void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); }
void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); }
void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); }
void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); }
void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); }
void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
@ -1653,6 +1691,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T
void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
#else
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
@ -1907,7 +1946,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@ -2141,38 +2179,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T
void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); }
void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); }
void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); }
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); }
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); }
void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); }
void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); }
void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); }
void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); }
void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); }
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); }
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); }
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); }
void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); }
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); }
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); }
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); }
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); }
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); }
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); }
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); }
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); }
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); }
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); }
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); }
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); }
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); }
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); }
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); }
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); }
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); }
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); }
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); }
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); }
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); }
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); }
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); }
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); }
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); }
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); }
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); }
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); }
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); }
void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); }
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); }
void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); }
void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); }

View file

@ -4,7 +4,6 @@
#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
#define XBYAK_THROW(x) ;
@ -96,6 +95,11 @@ struct TypeT {
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
template<typename T>
inline T max_(T x, T y) { return x >= y ? x : y; }
template<typename T>
inline T min_(T x, T y) { return x < y ? x : y; }
} // local
/**
@ -193,8 +197,8 @@ private:
/*
Fallback values in case a hypervisor has 0xB leaf zeroed-out.
*/
numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
} else {
/*
Failed to deremine num of cores without x2APIC support.
@ -237,7 +241,7 @@ private:
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] =
@ -247,7 +251,7 @@ private:
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
@ -302,7 +306,7 @@ public:
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#ifdef _WIN32
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
@ -406,6 +410,13 @@ public:
XBYAK_DEFINE_TYPE(65, tMOVDIRI);
XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
XBYAK_DEFINE_TYPE(68, tAMX_FP16);
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
@ -545,10 +556,17 @@ public:
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
if (EAX & (1U << 21)) type_ |= tAMX_FP16;
if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
}
}
setFamily();
@ -771,7 +789,7 @@ public:
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
const Reg64& _rsp = code->rsp;
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
saveNum_ = local::max_(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));