Squashed 'externals/xbyak/' changes from a1ac3750f..2ce465bbc

2ce465bbc Merge branch 'dev'
0b3f360eb v7.05
66f22b7a4 update doc
13ee4e19f use opSetCC for setCC
383866b42 use opMR with APX
d6e6e6f85 tweak
a7b02ac80 RAO_INT supports APX
26840492c use Address.immSize
e2b40a33e refactor Address class
e1b6896c2 Merge branch 'dev'
c0888cc45 v7.04
7d9c82835 refactor rex
b3e27734b apx supports 0x0f opecode with rex2
2e7b62d78 bswap supports apx
2e93baa6a Merge branch 'dev'
e1864642c unify getMap and getMMM
0750873b7 T_MAP3 is not necessary
ee4984222 T_MAP1 is not necessary
5c95842be tweak
8c44467af add no_flags sample
523cf1ed0 fix comment of sample/ccmp.cpp
5438fc69d Merge branch 'dev'
ee26c094e v7.03
691ce361a [doc] update dfv
8d0e78146 set 0 for the default value of dfv
2255aea0d [doc] add ccmpSCC and ctestSCC
b5e115284 add sample/ccmp.cpp
bacd8d34b add sample/zero_upper.cpp
f17cb9d6b Merge branch 'dev'
c9ce3f8f6 v7.02
3427be298 unify opAESKL and opSHA
bfd14244a update doc
e690a2a47 sha* supports apx
c9765588f Merge branch 'dev'
903f7c02e v7.01
54a1f07f9 update cpuid by sde
223ddfaf8 add detection of sse4a/clwb
ba943b5b6 reorder cpu detection
30c362df5 Merge branch 'Sonicadvance1-missing_checks' into dev
02bc84ad8 renumber of tSSE4a, tCLWB
84fe3ab9d update doc
90fc0151c add encodekey{128,256}
440972b88 add detection of KEYLOCKER, KEYLOCKER_WIDE
68a30b91f add detection of AESKLE, WIDE_KL
e2d36c662 fix detection of AVX10
48551f5cc add aesenc{128,256}kl, aesencwide{128,256}kl
d9c7c992f add aesdecwide{128,256}kl
cd5231de0 add aesdec256kl
fcb3d0dbb add aesdec128kl
85709ace7 mvoe opKmov in private
406199e7a Support cpuid CLWB
1214aad95 Adds back missing SSE4a check
5315658ad add detection of avx10/apx_f
835f6d2e6 Merge pull request #180 from Tachi107/fix-32bit-tests
650b241e3 test: only run apx test when BIT=64
016ce86b6 [doc] add a blank line
df0ebc740 v7.00
1ec2adbbb Merge branch 'apx'
da1818592 update doc
bec145ba9 amx supports apx
944438195 add tests of kmov*
bd85d108c kmov* supports apx
93bd6a0b7 rename T_VEX to T_APX
b063d276f add misc tests
6d21c7389 add evex tests
05a66d2c0 support V4 in evex
33017d4fb support V4 in evex
e228e737d prepare evex extension of evex
45eca7987 update doc
98ce73bb2 add cfcmov tests
e2d9685af add cfcmov
a4ec97ca9 add tests of ctestscc
45711c502 add ctestscc
a1f6c14cc add alias of dfv
facb052a1 avoid r15 on 32-bit mode
c1c15848c remove warnings
be319626b add ccmpscc with imm
c4d05037e add ccmpscc
17f7d279c testing ccmpb
ff01b1e20 setcc supports apx
25ceea2ef add 3-op cmovcc
2f8cfb9a8 CMPccXADD supports APX
a9310deac add tests of push/pop
ec2881bfd push/pop support rex2
114152fed add push2/pop2
1aefdb649 support jmpabs
77eca6d0d add tests of 3-op shift
5e54ffdfa add 3-op shift
426814c50 check v instead of r
3f3d6095c disable rol/ror to support NF
ee572b7eb add tests of ror/rol
186d63ad9 add tests of shr/sar
26be71a12 2-op shl supports apx
83f5bd25e remove some warnings
e43d99762 add crc32 tests
92153b6f8 crc32 supports apx
d7ca6a2dd split T_F2 from T_66|T_F3
fb1fc738f tweak
389d73347 movbe supports apx and append test
3636cde22 tests of 1-byte opcode with rex2
1dd020126 check whether or not it is a 1-byte opcode
083822b52 movdiri supports apx
6703d4344 movdir64b supports apx
ed5dc3516 add tests of shld/shrd
b01c0ed40 shld/shrd support apx
c51c4a6f7 add tests of lzcnt and tzcnt
2cc22ea1b lzcnt and tzcnt support apx
baddec288 tweak
1d3a19a50 update doc of apx
273d8d5b6 add 3-op imul with T_zu
50875294c add tests of 2-op imul
d20142d01 add T_zu
eb9de1392 2-op imul supports apx
dba2c174f add 2op neg/not_
95ad5927f add tests of imul/mul/neg/not_ with 1-op
790afb745 add tests o idiv
045ef31a3 add tests of div
1d7e2a6bb div supports apx
e5fe58231 remove warning on 32-bit
66b3a3042 check all regs of NF
c7dba88df add dec test
f55f596ad add inc test
6f6423899 2-op inc/dec
95c0c4e6f tweak inc/dec
f5fda7ace change detection of pp with type
a18e5aeb5 rorx supports apx
5bb8461b4 blsmsk, blsr support apx
a493dc7b4 blsi supports apx
7c1accedc sarx/shlx/shrx support apx and add tests
125d8e740 test bzhi with apx
78be5afd1 add tests of bextr with apx
e9603b79d bextr supports apx
3a85aadc6 pdep, pext support apx
16f1a5d8a mulx supports apx
82529af93 andn supports APX
637ad7a4a add test of NF
e23f5ad75 fix type for adc
1bcc83303 3-op add supports T_nf
5d46b950b the type of all type is uint64_t
0a8ea9edf fix type
b1f0fef4d add test of 3op apx
9b21727ba remove space
6fa1b4a90 reorder of opRO
2d1f229a0 simplify condR
b220be972 simplify opRO
24b71a1ce use Reg instead of Operand if possible
de1353448 rename opGen with opSSE
4cd8e8eac refactor opGpr as opRRO
01d756917 rename
5037120f7 replace old rex with rexA
45fe94fdd rename opLoadSeg2 with opLoadSeg
253f800bc tweak
4f3939d92 rename opModM2 with opModM
fa731a27c rename opModR2 with opModR
e5db7d0e4 rename opModRM2 to opModRM
dc20fd09b use opModRM2
d4da1561b rename opR_ModM2 with opR_ModM
ef3665274 use opR_ModM2
e5b20e5a5 use opModM2
104941db2 use opModM2
6ae769f21 rename opROO2 with opROO
1521cb7ce rename opGen2 to opGen
f9c6cb5dc all opGen are replaced with opGen2
249d6978a use opGen2
81ae48922 use opGen2
b9e4bb2fc always put prefix as byte code
3374a158f use opGen2
719f81f45 use opGen2
8d037ebd6 use opGen2
6f8bc28e2 use opGen2
303876cac use opGen2
f0b49752a rewrite opMovXMM
5d4c48ffd rewrite opMMX
189c3488b use opMMX2
1361d0946 use opMMX2
32cafcc61 tweak
cf1cfd6c4 add temporary converting code
433bf29e3 replacing opModR with opModR2
ba1d07ed1 senduipi uses opModR2
646da9750 use opModR2 for rdrand, rdseed, movq
ccad6cecd use opModR2 for movdq2q, movq2dq
3c21754b9 use opModR2 for movd, movmskps
4718643ef use opModR2 for bswap, maskmovq, pmovmskb
e1a148707 try to use opModR2
220a5def7 split avx_type_def.h in gen/
87b8c8ed2 adox passes the test
bd8477292 fix detection of adox without apx
6b19515eb add adcx, adox with APX
77d6acea6 increase the room of type
710e39bfe add test of r, r/m
ea9cd9ade tweak
057f09c5b rename T_NF to T_nf
57a0c1935 support NF=1
8f49739da remove cmp of 3-op
e3310344c [doc] about APX
cdc2533c1 add test of adc/3op
9c6b81c4d return value on nothrow mode
8d524b4a4 add op(r, r/m, imm) and op(r, r/m, r/m)
4c62d1fdc test adc2(r, op, mem) and adc2(r, mem, op)
6f593a1cb test of adc2 (3op APX)
61addb9d9 simplify opMIB
575c447f1 remove rex2p
a95bd9cc5 add test of adc/add/and_/cmp/or_/sbb/sub/xor_
f7d3c17e8 tweak
d7a7ea912 refactoring rex
acd797139 use opModM instead of opMIB
ad3334ba6 add modRM with rex2
059d115b5 add test of apx.cpp
873c93a51 add test of regs of apx
e25b1cd62 [not tested] add(r1, r2) with rex2
eb118504d remove warning of VC
6c580b1f7 fix cvt test for extended r16-r31
981fa6f05 add r16 - r31
244623812 Merge branch 'dev'
aafe3cb62 build(cmake): bump minimum required to version 3.5
76d7477d7 Merge branch 'dev'
151c8ab04 v6.73
dd66cfb76 add tests of avx-vnni-int{8,16}
4a6132d66 update cpuid list
bea25541a add detection of AVX_VNNI_INT16
d9e76b1c6 add tests of SM4
e1c4c360b add SM4
d79717dbe add tests of SM3
48f8dbeb6 add SM3
5473d3933 vsha512* check regs
9b3687a68 add detection of SHA512, SM3, SM4
ecdd01ee5 mov crypt test in 64-bit mode
c4550b6a9 sde 9.24.0
5762819de add vsha512{msg1, msg2, rnds2}
3255d606a Merge branch 'dev'
322665e72 v6.72
ad178a219 add xabort/xbegin/xend
0924ff4aa Merge branch 'dev'
8980934c1 v6.71
76292b310 add SystemInfo class for win
3e42709ab ignore space and cr
66b2768a6 disable wrong detection of gcc
1855985e1 remove / for mingw64
5bdccc0b8 64bit only for mingw64
33882d0a0 use sysconf(_SC_PAGESIZE) instead of const value 4096 on linux
33075c2bd add link to other projects
60e71402e reorder
79854aa08 add new cpus
5921e270c update cpuid
ce083a0dc Merge branch 'dev'
b538485f3 v6.70
461dd34ee udpate doc
2149c79e3 add test of alias of vpclmulqdq
2c59c5c91 add alias of vpclmulqdq
729ae4aa3 fix alias of pclmulqdq
3c248d68a define XBYAK_CONSTEXPR if XBYAK_ONLY_CLASS_CPU is defined
c0a932d7b Merge remote-tracking branch 'origin/dev'
ef502b5b4 update doc
ba3db4730 update version
c0d7a704f v6.69.2
c535f4737 update cpuid test list
683249232 change the order of args of diff
e81b95583 Merge branch 'Wunkolo-constexpr-typet' into dev
ab3f40587 Allow constexpr TypeT `operator|`
ad5276fa4 Merge pull request #172 from orz--/patch-1
b4d54f6e1 Update changelog.md
58642e0cd Merge branch 'dev'
3b13d068b v6.69.1
d700f6c35 add detection of xsave
740dff2e8 Merge branch 'dev'
dc048a04c v6.69
ad0dfffd2 add senduipi/stui/testui/uiret
e78f1121b add clui
23b40331a add detection of uintr
98a0f1924 remove warning of sign/unsigned
0afd71a27 add detection of SERIALIZE
363bbaa57 sample shows cpu cache info for AMD
edce72709 Cpu supports AMD

git-subtree-dir: externals/xbyak
git-subtree-split: 2ce465bbca46e92dde9c44bbe7940fd7f70e3b97
This commit is contained in:
Merry 2024-01-30 00:36:49 +00:00
parent f6fdb5f55a
commit fdf626b74f
60 changed files with 5177 additions and 2057 deletions

View file

@ -1,53 +1,49 @@
cmake_minimum_required(VERSION 2.6...3.0.2)
cmake_minimum_required(VERSION 3.5)
project(xbyak LANGUAGES CXX VERSION 6.68)
project(xbyak LANGUAGES CXX VERSION 7.05)
file(GLOB headers xbyak/*.h)
if (DEFINED CMAKE_VERSION AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.0.2)
include(GNUInstallDirs)
add_library(${PROJECT_NAME} INTERFACE)
add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
include(GNUInstallDirs)
add_library(${PROJECT_NAME} INTERFACE)
add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
target_include_directories(
${PROJECT_NAME} INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
target_include_directories(
${PROJECT_NAME} INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
install(
TARGETS ${PROJECT_NAME}
EXPORT ${PROJECT_NAME}-targets
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)
install(
TARGETS ${PROJECT_NAME}
EXPORT ${PROJECT_NAME}-targets
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
cmake/config.cmake.in
include(CMakePackageConfigHelpers)
configure_package_config_file(
cmake/config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
COMPATIBILITY SameMajorVersion
)
install(
FILES
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
COMPATIBILITY SameMajorVersion
)
DESTINATION
${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
install(
FILES
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
DESTINATION
${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
install(
EXPORT ${PROJECT_NAME}-targets
NAMESPACE ${PROJECT_NAME}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
elseif(NOT DEFINED CMAKE_INSTALL_INCLUDEDIR)
set(CMAKE_INSTALL_INCLUDEDIR "include")
endif()
install(
EXPORT ${PROJECT_NAME}-targets
NAMESPACE ${PROJECT_NAME}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
install(
FILES ${headers}

View file

@ -1,5 +1,18 @@
# History
* 2024/Jan/03 ver 7.05 support RAO-INT for APX
* 2023/Dec/28 ver 7.04 rex2 supports two-byte opcode
* 2023/Dec/26 ver 7.03 set the default value of dfv to 0
* 2023/Dec/20 ver 7.02 SHA* support APX
* 2023/Dec/19 ver 7.01 support AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE, detection of AVX10/APX
* 2023/Dec/01 ver 7.00 support APX
* 2023/Aug/07 ver 6.73 add sha512/sm3/sm4/avx-vnni-int16
* 2023/Aug/02 ver 6.72 add xbegin/xabort/xend
* 2023/Jul/27 ver 6.71 Allocator supports huge page
* 2023/Jul/05 ver 6.70 add alias of vpclmulqdq, correct alias of pclmulqdq
* 2023/Jun/27 ver 6.69.2 add constexpr to `TypeT operator|` (thanks to Wunkolo)
* 2023/Mar/23 ver 6.69.1 add detection of xsave (thanks to Wunkolo)
* 2023/Feb/20 ver 6.69 util::Cpu supports AMD CPUs. support UINTR
* 2022/Dec/07 ver 6.68 support prefetchit{0,1}
* 2022/Nov/30 ver 6.67 support CMPccXADD
* 2022/Nov/25 ver 6.66 support RAO-INT

View file

@ -128,6 +128,34 @@ vpdpbusd(xm0, xm1, xm2); // VEX encoding
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
## APX
[Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html)
- Support 64-bit 16 additional GPRs (general-purpose registers) r16, ..., r31
- 32-bit regs are r16d, ..., r31d
- 16-bit regs are r16w, ..., r31w
- 8-bit regs are r16b, ..., r31b
- `add(r20, r21);`
- `lea(r30, ptr[r29+r31]);`
- Support three-operand instruction
- `add(r20, r21, r23);`
- `add(r20, ptr[rax + rcx * 8 + 0x1234], r23);`
- Support T_nf for NF=1 (status flags update suppression)
- `add(r20|T_nf, r21, r23);` // Set EVEX.NF=1
- Support T_zu for ND=ZU (zero upper) for imul and setcc
- `imul(ax|T_zu, cx, 0x1234);` // Set ND=ZU
- `imul(ax|T_zu|T_nf, cx, 0x1234);` // Set ND=ZU and EVEX.NF=1
- `setb(r31b|T_zu);` // same as setb(r31b); movzx(r31, r31b);
- See [sample/zero_upper.cpp](../sample/zero_upper.cpp)
### ccmpSCC and ctestSCC
- ccmpSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? cmp(op1, op2) : dfv
- ctestSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? test(op1, op2) : dfv
- SCC means source condition code such as z, a, gt.
- See [sample/ccmp.cpp](../sample/ccmp.cpp)
- Specify dfv as the bitwise OR of any of T_of(=8), T_sf(=4), T_zf(=2), and T_cf(=1).
## Label
Two kinds of Label are supported. (String literal and Label class).

View file

@ -1,12 +1,17 @@
TARGET=../xbyak/xbyak_mnemonic.h
BIN=sortline gen_code gen_avx512
CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
CFLAGS=-I../ -I ./ -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt
avx_type_def.h: ../xbyak/xbyak.h
sed -n '/@@@begin of avx_type_def.h/,/@@@end of avx_type_def.h/p' $< > $@
avx_type.hpp: avx_type_def.h
sortline: sortline.cpp
$(CXX) $(CFLAGS) $< -o $@
gen_code: gen_code.cpp ../xbyak/xbyak.h avx_type.hpp
gen_code: gen_code.cpp avx_type.hpp
$(CXX) $(CFLAGS) $< -o $@
gen_avx512: gen_avx512.cpp ../xbyak/xbyak.h avx_type.hpp
gen_avx512: gen_avx512.cpp avx_type.hpp
$(CXX) $(CFLAGS) $< -o $@
$(TARGET): $(BIN)
@ -36,4 +41,4 @@ VER=$(shell head -n 1 ../xbyak/xbyak_mnemonic.h|grep -o "[0-9.]*")
sed -l 2 -i -e "s/Xbyak [0-9.]*/Xbyak $(VER)/" $@
clean:
$(RM) $(BIN) $(TARGET)
$(RM) $(BIN) $(TARGET) avx_type_def.h

View file

@ -1,190 +1,72 @@
#include <assert.h>
// copy CodeGenerator::AVXtype
enum AVXtype {
// low 3 bit
T_N1 = 1,
T_N2 = 2,
T_N4 = 3,
T_N8 = 4,
T_N16 = 5,
T_N32 = 6,
T_NX_MASK = 7,
//
T_N_VL = 1 << 3, // N * (1, 2, 4) for VL
T_DUP = 1 << 4, // N = (8, 32, 64)
T_66 = 1 << 5, // pp = 1
T_F3 = 1 << 6, // pp = 2
T_F2 = T_66 | T_F3, // pp = 3
T_ER_R = 1 << 7, // reg{er}
T_0F = 1 << 8,
T_0F38 = 1 << 9,
T_0F3A = 1 << 10,
T_L0 = 1 << 11,
T_L1 = 1 << 12,
T_W0 = 1 << 13,
T_W1 = 1 << 14,
T_EW0 = 1 << 15,
T_EW1 = 1 << 16,
T_YMM = 1 << 17, // support YMM, ZMM
T_EVEX = 1 << 18,
T_ER_X = 1 << 19, // xmm{er}
T_ER_Y = 1 << 20, // ymm{er}
T_ER_Z = 1 << 21, // zmm{er}
T_SAE_X = 1 << 22, // xmm{sae}
T_SAE_Y = 1 << 23, // ymm{sae}
T_SAE_Z = 1 << 24, // zmm{sae}
T_MUST_EVEX = 1 << 25, // contains T_EVEX
T_B32 = 1 << 26, // m32bcst
T_B64 = 1 << 27, // m64bcst
T_B16 = T_B32 | T_B64, // m16bcst
T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_FP16 = 1 << 31,
T_MAP5 = T_FP16 | T_0F,
T_MAP6 = T_FP16 | T_0F38,
T_XXX
};
// T_66 = 1, T_F3 = 2, T_F2 = 3
uint32_t getPP(int type) { return (type >> 5) & 3; }
#include "avx_type_def.h"
const int NONE = 256; // same as Xbyak::CodeGenerator::NONE
std::string type2String(int type)
std::string type2String(uint64_t type)
{
if (type == 0) return "T_NONE";
std::string str;
int low = type & T_NX_MASK;
if (0 < low) {
if (0 < low && low < 7) {
const char *tbl[8] = {
"T_N1", "T_N2", "T_N4", "T_N8", "T_N16", "T_N32"
};
assert(low < int(sizeof(tbl) / sizeof(tbl[0])));
str = tbl[low - 1];
}
if (type & T_N_VL) {
if (!str.empty()) str += " | ";
str += "T_N_VL";
}
if (type & T_DUP) {
if (!str.empty()) str += " | ";
str += "T_DUP";
}
if (type & T_F2) {
if (!str.empty()) str += " | ";
switch (type & T_F2) {
case T_66: str += "T_66"; break;
case T_F3: str += "T_F3"; break;
case T_F2: str += "T_F2"; break;
default: break;
}
}
if (type & T_N_VL) str += "|T_N_VL";
if (type & T_APX) str += "|T_APX";
if ((type & T_NX_MASK) == T_DUP) str += "|T_DUP";
if (type & T_66) str += "|T_66";
if (type & T_F3) str += "|T_F3";
if (type & T_F2) str += "|T_F2";
if (type & T_0F) {
if (!str.empty()) str += " | ";
if (type & T_FP16) {
str += "T_MAP5";
str += "|T_MAP5";
} else {
str += "T_0F";
str += "|T_0F";
}
}
if (type & T_0F38) {
if (!str.empty()) str += " | ";
if (type & T_FP16) {
str += "T_MAP6";
str += "|T_MAP6";
} else {
str += "T_0F38";
str += "|T_0F38";
}
}
if (type & T_0F3A) {
if (!str.empty()) str += " | ";
str += "T_0F3A";
}
if (type & T_L0) {
if (!str.empty()) str += " | ";
str += "VEZ_L0";
}
if (type & T_L1) {
if (!str.empty()) str += " | ";
str += "VEZ_L1";
}
if (type & T_W0) {
if (!str.empty()) str += " | ";
str += "T_W0";
}
if (type & T_W1) {
if (!str.empty()) str += " | ";
str += "T_W1";
}
if (type & T_EW0) {
if (!str.empty()) str += " | ";
str += "T_EW0";
}
if (type & T_EW1) {
if (!str.empty()) str += " | ";
str += "T_EW1";
}
if (type & T_YMM) {
if (!str.empty()) str += " | ";
str += "T_YMM";
}
if (type & T_EVEX) {
if (!str.empty()) str += " | ";
str += "T_EVEX";
}
if (type & T_ER_X) {
if (!str.empty()) str += " | ";
str += "T_ER_X";
}
if (type & T_ER_Y) {
if (!str.empty()) str += " | ";
str += "T_ER_Y";
}
if (type & T_ER_Z) {
if (!str.empty()) str += " | ";
str += "T_ER_Z";
}
if (type & T_ER_R) {
if (!str.empty()) str += " | ";
str += "T_ER_R";
}
if (type & T_SAE_X) {
if (!str.empty()) str += " | ";
str += "T_SAE_X";
}
if (type & T_SAE_Y) {
if (!str.empty()) str += " | ";
str += "T_SAE_Y";
}
if (type & T_SAE_Z) {
if (!str.empty()) str += " | ";
str += "T_SAE_Z";
}
if (type & T_MUST_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MUST_EVEX";
}
if (type & T_B32) {
if (!str.empty()) str += " | ";
if (type & T_B64) {
str += "T_B16"; // T_B16 = T_B32 | T_B64
} else {
str += "T_B32";
}
} else if (type & T_B64) {
if (!str.empty()) str += " | ";
str += "T_B64";
}
if (type & T_M_K) {
if (!str.empty()) str += " | ";
str += "T_M_K";
}
if (type & T_VSIB) {
if (!str.empty()) str += " | ";
str += "T_VSIB";
}
if (type & T_MEM_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MEM_EVEX";
if (type & T_0F3A) str += "|T_0F3A";
if (type & T_L0) str += "|T_L0";
if (type & T_L1) str += "|T_L1";
if (type & T_W0) str += "|T_W0";
if (type & T_W1) str += "|T_W1";
if (type & T_EW0) str += "|T_EW0";
if (type & T_EW1) str += "|T_EW1";
if (type & T_YMM) str += "|T_YMM";
if (type & T_EVEX) str += "|T_EVEX";
if (type & T_ER_X) str += "|T_ER_X";
if (type & T_ER_Y) str += "|T_ER_Y";
if (type & T_ER_Z) str += "|T_ER_Z";
if (type & T_ER_R) str += "|T_ER_R";
if (type & T_SAE_X) str += "|T_SAE_X";
if (type & T_SAE_Y) str += "|T_SAE_Y";
if (type & T_SAE_Z) str += "|T_SAE_Z";
if (type & T_MUST_EVEX) str += "|T_MUST_EVEX";
switch (type & T_B16) { // T_B16 = T_B32 | T_B64
case T_B16: str += "|T_B16"; break;
case T_B32: str += "|T_B32"; break;
case T_B64: str += "|T_B64"; break;
default: break;
}
if (type & T_M_K) str += "|T_M_K";
if (type & T_VSIB) str += "|T_VSIB";
if (type & T_MEM_EVEX) str += "|T_MEM_EVEX";
if (type & T_NF) str += "|T_NF";
if (type & T_CODE1_IF1) str += "|T_CODE1_IF1";
if (type & T_ND1) str += "|T_ND1";
if (type & T_ZU) str += "|T_ZU";
if (str[0] == '|') str = str.substr(1);
return str;
}

52
gen/avx_type_def.h Normal file
View file

@ -0,0 +1,52 @@
// @@@begin of avx_type_def.h
// Encoding-attribute constants. Values are OR-ed together into one 64-bit
// `type` word; the low 3 bits hold a small enumerated value (1..7), every
// other constant is a single-bit flag unless noted as a combination.
static const uint64_t T_NONE = 0ull;
// low 3 bit: enumerated value, extract with T_NX_MASK
static const uint64_t T_N1 = 1ull;
static const uint64_t T_N2 = 2ull;
static const uint64_t T_N4 = 3ull;
static const uint64_t T_N8 = 4ull;
static const uint64_t T_N16 = 5ull;
static const uint64_t T_N32 = 6ull;
static const uint64_t T_NX_MASK = 7ull;
static const uint64_t T_DUP = T_NX_MASK; // uses the value 7 of the low 3 bits (not a separate bit); N = (8, 32, 64)
static const uint64_t T_N_VL = 1ull << 3; // N * (1, 2, 4) for VL
static const uint64_t T_APX = 1ull << 4;
static const uint64_t T_66 = 1ull << 5; // pp = 1
static const uint64_t T_F3 = 1ull << 6; // pp = 2
static const uint64_t T_ER_R = 1ull << 7; // reg{er}
static const uint64_t T_0F = 1ull << 8;
static const uint64_t T_0F38 = 1ull << 9;
static const uint64_t T_0F3A = 1ull << 10;
static const uint64_t T_L0 = 1ull << 11;
static const uint64_t T_L1 = 1ull << 12;
static const uint64_t T_W0 = 1ull << 13;
static const uint64_t T_W1 = 1ull << 14;
static const uint64_t T_EW0 = 1ull << 15;
static const uint64_t T_EW1 = 1ull << 16;
static const uint64_t T_YMM = 1ull << 17; // support YMM, ZMM
static const uint64_t T_EVEX = 1ull << 18;
static const uint64_t T_ER_X = 1ull << 19; // xmm{er}
static const uint64_t T_ER_Y = 1ull << 20; // ymm{er}
static const uint64_t T_ER_Z = 1ull << 21; // zmm{er}
static const uint64_t T_SAE_X = 1ull << 22; // xmm{sae}
static const uint64_t T_SAE_Y = 1ull << 23; // ymm{sae}
static const uint64_t T_SAE_Z = 1ull << 24; // zmm{sae}
static const uint64_t T_MUST_EVEX = 1ull << 25; // contains T_EVEX
static const uint64_t T_B32 = 1ull << 26; // m32bcst
static const uint64_t T_B64 = 1ull << 27; // m64bcst
// m16bcst: both broadcast bits set, so compare (type & T_B16) against
// T_B16/T_B32/T_B64 for equality rather than testing it as a single flag.
static const uint64_t T_B16 = T_B32 | T_B64;
static const uint64_t T_M_K = 1ull << 28; // mem{k}
static const uint64_t T_VSIB = 1ull << 29;
static const uint64_t T_MEM_EVEX = 1ull << 30; // use evex if mem
static const uint64_t T_FP16 = 1ull << 31; // avx512-fp16
// MAP5/MAP6 are T_FP16 combined with the corresponding opcode-map bit.
static const uint64_t T_MAP5 = T_FP16 | T_0F;
static const uint64_t T_MAP6 = T_FP16 | T_0F38;
static const uint64_t T_NF = 1ull << 32; // T_nf
static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8)
// bit 34 is not assigned in this list
static const uint64_t T_ND1 = 1ull << 35; // ND=1
static const uint64_t T_ZU = 1ull << 36; // ND=ZU
static const uint64_t T_F2 = 1ull << 37; // pp = 3
// Map the prefix flags in `type` to the pp value: T_66 -> 1, T_F3 -> 2,
// T_F2 -> 3, none of them -> 0. When several are set, the first match in
// that order is returned.
static inline uint32_t getPP(uint64_t type)
{
	if (type & T_66) return 1;
	if (type & T_F3) return 2;
	if (type & T_F2) return 3;
	return 0;
}
// @@@end of avx_type_def.h

View file

@ -15,8 +15,7 @@ using namespace Xbyak;
void putOpmask(bool only64bit)
{
if (only64bit) {
puts("void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }");
puts("void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }");
puts("void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); }");
return;
}
@ -76,22 +75,14 @@ void putOpmask(bool only64bit)
printf("void %sd(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x%02X, imm); }\n", p.name, p.code + 1);
}
}
puts("void kmovw(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W0, 0x90); }");
puts("void kmovq(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W1, 0x90); }");
puts("void kmovb(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W0, 0x90); }");
puts("void kmovd(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W1, 0x90); }");
puts("void kmovw(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W0, 0x91); }");
puts("void kmovq(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W1, 0x91); }");
puts("void kmovb(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W0, 0x91); }");
puts("void kmovd(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W1, 0x91); }");
puts("void kmovw(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_W0, 0x92); }");
puts("void kmovw(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_W0, 0x93); }");
puts("void kmovb(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_66 | T_W0, 0x92); }");
puts("void kmovb(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_66 | T_W0, 0x93); }");
puts("void kmovd(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W0, 0x92); }");
puts("void kmovd(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W0, 0x93); }");
for (int i = 0; i < 4; i++) {
const char tbl[] = "bwdq";
const int bitTbl[] = { 8, 16, 32, 64 };
int bit = bitTbl[i];
printf("void kmov%c(const Opmask& k, const Operand& op) { opKmov(k, op, false, %d); }\n", tbl[i], bit);
printf("void kmov%c(const Address& addr, const Opmask& k) { opKmov(k, addr, true, %d); }\n", tbl[i], bit);
if (i < 3) printf("void kmov%c(const Reg32& r, const Opmask& k) { opKmov(k, r, true, %d); }\n", tbl[i], bit);
}
}
// vcmppd(k, x, op)
@ -100,7 +91,7 @@ void putVcmp()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
bool hasIMM;
} tbl[] = {
{ 0xC2, "vcmppd", T_0F | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_YMM | T_66 | T_B64, true },
@ -142,9 +133,9 @@ void putVcmp()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
std::string s = type2String(p->type);
printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : "");
, p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
}
puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }");
puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }");
@ -173,7 +164,7 @@ void putX_XM()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
} tbl[] = {
{ 0x6F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z },
{ 0x6F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z },
@ -210,8 +201,8 @@ void putX_XM()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
std::string s = type2String(p->type);
printf("void %s(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
}
puts("void vpabsq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F); }");
@ -229,7 +220,7 @@ void putM_X()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
} tbl[] = {
{ 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
{ 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
@ -242,8 +233,8 @@ void putM_X()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
std::string s = type2String(p->type);
printf("void %s(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
}
}
@ -252,7 +243,7 @@ void putXM_X()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
} tbl[] = {
{ 0x8A, "vcompresspd", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_N8 },
{ 0x8A, "vcompressps", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 },
@ -265,8 +256,8 @@ void putXM_X()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
printf("void %s(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code);
std::string s = type2String(p->type);
printf("void %s(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
}
}
@ -275,7 +266,7 @@ void putX_X_XM_IMM()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
bool hasIMM;
} tbl[] = {
{ 0x03, "valignd", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM, true },
@ -413,9 +404,9 @@ void putX_X_XM_IMM()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
std::string s = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : "");
, p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
}
}
@ -425,7 +416,7 @@ void putShift()
const char *name;
uint8_t code;
int idx;
int type;
uint64_t type;
} tbl[] = {
{ "vpsraq", 0x72, 4, T_0F | T_66 | T_YMM | T_MUST_EVEX |T_EW1 | T_B64 },
{ "vprold", 0x72, 1, T_66 | T_0F | T_YMM | T_MUST_EVEX | T_EW0 | T_B32 },
@ -435,8 +426,8 @@ void putShift()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
printf("void %s(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_X_XM(Xmm(x.getKind(), %d), x, op, %s, 0x%02X, imm); }\n", p.name, p.idx, type.c_str(), p.code);
std::string s = type2String(p.type);
printf("void %s(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_X_XM(Xmm(x.getKind(), %d), x, op, %s, 0x%02X, imm); }\n", p.name, p.idx, s.c_str(), p.code);
}
}
@ -446,7 +437,7 @@ void putExtractInsert()
const struct Tbl {
const char *name;
uint8_t code;
int type;
uint64_t type;
bool isZMM;
} tbl[] = {
{ "vextractf32x4", 0x19, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false },
@ -461,16 +452,16 @@ void putExtractInsert()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
std::string s = type2String(p.type);
const char *kind = p.isZMM ? "Operand::MEM | Operand::YMM" : "Operand::MEM | Operand::XMM";
printf("void %s(const Operand& op, const %s& r, uint8_t imm) { if (!op.is(%s)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, %s, 0x%2X, imm); }\n", p.name, p.isZMM ? "Zmm" : "Ymm", kind, type.c_str(), p.code);
printf("void %s(const Operand& op, const %s& r, uint8_t imm) { if (!op.is(%s)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, %s, 0x%2X, imm); }\n", p.name, p.isZMM ? "Zmm" : "Ymm", kind, s.c_str(), p.code);
}
}
{
const struct Tbl {
const char *name;
uint8_t code;
int type;
uint64_t type;
bool isZMM;
} tbl[] = {
{ "vinsertf32x4", 0x18, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false },
@ -485,12 +476,12 @@ void putExtractInsert()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
std::string s = type2String(p.type);
const char *x = p.isZMM ? "Zmm" : "Ymm";
const char *cond = p.isZMM ? "op.is(Operand::MEM | Operand::YMM)" : "(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))";
printf("void %s(const %s& r1, const %s& r2, const Operand& op, uint8_t imm) {"
"if (!%s) XBYAK_THROW(ERR_BAD_COMBINATION) "
"opVex(r1, &r2, op, %s, 0x%2X, imm); }\n", p.name, x, x, cond, type.c_str(), p.code);
"opVex(r1, &r2, op, %s, 0x%2X, imm); }\n", p.name, x, x, cond, s.c_str(), p.code);
}
}
}
@ -501,7 +492,7 @@ void putBroadcast(bool only64bit)
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
int reg;
} tbl[] = {
{ 0x7A, "vpbroadcastb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, 8 },
@ -511,9 +502,9 @@ void putBroadcast(bool only64bit)
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
std::string s = type2String(p.type);
if ((only64bit && p.reg == 64) || (!only64bit && p.reg != 64)) {
printf("void %s(const Xmm& x, const Reg%d& r) { opVex(x, 0, r, %s, 0x%02X); }\n", p.name, p.reg, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Reg%d& r) { opVex(x, 0, r, %s, 0x%02X); }\n", p.name, p.reg, s.c_str(), p.code);
}
}
}
@ -536,7 +527,7 @@ void putCvt()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
int ptn;
} tbl[] = {
{ 0x79, "vcvtsd2usi", T_F2 | T_0F | T_MUST_EVEX | T_N8 | T_ER_X, 0 },
@ -583,28 +574,28 @@ void putCvt()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
std::string s = type2String(p.type);
switch (p.ptn) {
case 0:
printf("void %s(const Reg32e& r, const Operand& op) { int type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 1:
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 2:
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 3:
printf("void %s(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 4:
printf("void %s(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 5:
printf("void %s(const Xmm& x, const Operand& op) { opCvt5(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x, const Operand& op) { opCvt5(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 6:
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) int type = (%s) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x%02X); }\n", p.name, type.c_str(), p.code);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (%s) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
}
}
@ -621,7 +612,7 @@ void putGather()
{
const struct Tbl {
const char *name;
int type;
uint64_t type;
uint8_t code;
int mode;
} tbl[] = {
@ -636,15 +627,15 @@ void putGather()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_VSIB);
printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode);
std::string s = type2String(p.type | T_VSIB);
printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, s.c_str(), p.code, p.mode);
}
}
void putScatter()
{
const struct Tbl {
const char *name;
int type;
uint64_t type;
uint8_t code;
int mode; // reverse of gather
} tbl[] = {
@ -660,8 +651,8 @@ void putScatter()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_VSIB);
printf("void %s(const Address& addr, const Xmm& x) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode);
std::string s = type2String(p.type | T_VSIB);
printf("void %s(const Address& addr, const Xmm& x) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, s.c_str(), p.code, p.mode);
}
}
@ -689,7 +680,7 @@ void putMov()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
int mode;
} tbl[] = {
{ 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
@ -718,8 +709,8 @@ void putMov()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type);
printf("void %s(const Operand& op, const Xmm& x) { opVmov(op, x, %s, 0x%02X, %s); }\n", p.name, type.c_str(), p.code, p.mode ? "true" : "false");
std::string s = type2String(p.type);
printf("void %s(const Operand& op, const Xmm& x) { opVmov(op, x, %s, 0x%02X, %s); }\n", p.name, s.c_str(), p.code, p.mode ? "true" : "false");
}
}
}
@ -729,7 +720,7 @@ void putX_XM_IMM()
const struct Tbl {
uint8_t code;
const char *name;
int type;
uint64_t type;
bool hasIMM;
} tbl[] = {
{ 0x26, "vgetmantpd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_SAE_Z, true },
@ -770,9 +761,9 @@ void putX_XM_IMM()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string type = type2String(p->type);
std::string s = type2String(p->type);
printf("void %s(const Xmm& x, const Operand& op%s) { opAVX_X_XM_IMM(x, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : "");
, p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
}
}
@ -784,7 +775,7 @@ void putMisc()
const struct Tbl {
const char *name;
int zm;
int type;
uint64_t type;
uint8_t code;
bool isZmm;
} tbl[] = {
@ -810,9 +801,9 @@ void putMisc()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_66 | T_0F38 | T_MUST_EVEX | T_M_K | T_VSIB);
std::string s = type2String(p.type | T_66 | T_0F38 | T_MUST_EVEX | T_M_K | T_VSIB);
printf("void %s(const Address& addr) { opGatherFetch(addr, zm%d, %s, 0x%2X, Operand::%s); }\n"
, p.name, p.zm, type.c_str(), p.code, p.isZmm ? "ZMM" : "YMM");
, p.name, p.zm, s.c_str(), p.code, p.isZmm ? "ZMM" : "YMM");
}
}
@ -887,18 +878,18 @@ void putFP16_FMA()
{ "213", 0xA0 },
{ "231", 0xB0 },
};
int t = T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX;
uint64_t type = T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX;
const char *suf = 0;
if (tbl[i].isPH) {
t |= T_ER_Z | T_YMM | T_B16;
type |= T_ER_Z | T_YMM | T_B16;
suf = "ph";
} else {
t |= T_ER_X | T_N2;
type |= T_ER_X | T_N2;
suf = "sh";
}
std::string type = type2String(t);
std::string s = type2String(type);
printf("void %s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n"
, tbl[i].name, ord[k].str, suf, type.c_str(), tbl[i].code | ord[k].code);
, tbl[i].name, ord[k].str, suf, s.c_str(), tbl[i].code | ord[k].code);
}
}
}
@ -914,23 +905,23 @@ void putFP16_FMA2()
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
for (int j = 0; j < 2; j++) {
int t = T_MAP6 | T_EW0 | T_MUST_EVEX;
uint64_t type = T_MAP6 | T_EW0 | T_MUST_EVEX;
if (j == 0) {
t |= T_F2;
type |= T_F2;
} else {
t |= T_F3;
type |= T_F3;
}
const char *suf = 0;
if (tbl[i].isPH) {
t |= T_ER_Z | T_YMM | T_B32;
type |= T_ER_Z | T_YMM | T_B32;
suf = "ph";
} else {
t |= T_ER_X | T_N2;
type |= T_ER_X | T_N2;
suf = "sh";
}
std::string type = type2String(t);
std::string s = type2String(type);
printf("void vf%s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n"
, j == 0 ? "c" : "", tbl[i].name, suf, type.c_str(), tbl[i].code);
, j == 0 ? "c" : "", tbl[i].name, suf, s.c_str(), tbl[i].code);
}
}
}
@ -938,16 +929,16 @@ void putFP16_FMA2()
void putFP16_2()
{
{
int t = T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2;
std::string type = type2String(t);
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", type.c_str());
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", type.c_str());
uint64_t type = T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2;
std::string s = type2String(type);
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
}
{
int t = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
std::string type = type2String(t);
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", type.c_str());
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", type.c_str());
uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
std::string s = type2String(type);
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str());
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str());
}
}

File diff suppressed because it is too large Load diff

View file

@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
version: '6.68',
version: '7.05',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)

View file

@ -1,5 +1,5 @@
# Xbyak 6.68 [![Badge Build]][Build Status]
# Xbyak 7.05 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@ -21,13 +21,21 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl
- header file only
- Intel/MASM like syntax
- fully support AVX-512
- support APX/AVX10
**Note**:
Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### Derived Projects
- [Xbyak_aarch64](https://github.com/fujitsu/xbyak_aarch64/) : for AArch64
- [Xbyak_riscv](https://github.com/herumi/xbyak_riscv) : for RISC-V
### News
- support RAO-INT for APX
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
- support APX except for a few instructions
- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
- add movdiri, movdir64b, clwb, cldemote
- WAITPKG instructions (tpause, umonitor, umwait) are supported.

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.05
-----------------------------------------------------------------------------
◎概要
@ -46,6 +46,8 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。
-----------------------------------------------------------------------------
◎新機能
APX/AVX10対応
例外なしモード追加
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
エラーは例外の代わりに`Xbyak::GetError()`で通達されます。
@ -402,6 +404,19 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
2024/01/03 ver 7.05 APX対応RAO-INT
2023/12/28 ver 7.04 2バイトオペコードのrex2対応
2023/12/26 ver 7.03 dfvのデフォルト値を0に設定
2023/12/20 ver 7.02 SHA*のAPX対応
2023/12/19 ver 7.01 AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE対応 AVX10/APX判定対応
2023/12/01 ver 7.00 APX対応
2023/08/07 ver 6.73 sha512/sm3/sm4/avx-vnni-int16追加
2023/08/02 ver 6.72 xabort, xbegin, xend追加
2023/07/27 ver 6.71 Allocatorでhuge pageを考慮する。
2023/07/05 ver 6.70 vpclmulqdqのalias追加
2023/06/27 ver 6.69.2 `TypeT operator|`にconstexpr追加(thanks to Wunkolo)
2023/03/23 ver 6.69.1 xsave判定追加(thanks to Wunkolo)
2023/02/20 ver 6.69 util::CpuがAMD対応 UINTR命令対応
2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート

View file

@ -30,7 +30,7 @@ else
endif
ifeq ($(BIT),64)
TARGET += test64 bf64 memfunc64 test_util64 jmp_table64
TARGET += test64 bf64 memfunc64 test_util64 jmp_table64 zero_upper ccmp no_flags
ifeq ($(BOOST_EXIST),1)
TARGET += calc64 #calc2_64
endif
@ -103,6 +103,18 @@ profiler: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@
profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
zero_upper: zero_upper.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) zero_upper.cpp -o $@
test_zero_upper: zero_upper
sde -future -- ./zero_upper
ccmp: ccmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) ccmp.cpp -o $@
test_ccmp: ccmp
sde -future -- ./ccmp
no_flags: no_flags.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) no_flags.cpp -o $@
test_no_flags: no_flags
sde -future -- ./no_flags
clean:
rm -rf $(TARGET) profiler profiler-vtune
@ -122,7 +134,7 @@ toyvm : toyvm.cpp $(XBYAK_INC)
static_buf: static_buf.cpp $(XBYAK_INC)
static_buf64: static_buf.cpp $(XBYAK_INC)
test_util : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h
test_util2 : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h
test_util64 : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h
jmp_table: jmp_table.cpp $(XBYAK_INC)
jmp_table64: jmp_table.cpp $(XBYAK_INC)
memfd: memfd.cpp $(XBYAK_INC)

68
sample/ccmp.cpp Normal file
View file

@ -0,0 +1,68 @@
/*
An example of ccmp
> g++ ccmp.cpp -I ..
> sde -future -- ./a.out
*/
#include <stdio.h>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
using namespace Xbyak;
// JIT generator for the first ccmp sample.
// Per the inline comment below, the emitted function evaluates
// (p1 > 3) && ((p2 & 1) == 0) into the flags and then stores ZF into al.
struct Code1 : Xbyak::CodeGenerator {
Code1()
{
// Frame with 2 parameters; sf.p[0] and sf.p[1] are used as the argument registers.
// NOTE(review): no explicit ret() is emitted in this constructor; presumably
// StackFrame emits the epilogue when sf is destroyed - confirm in xbyak_util.h.
Xbyak::util::StackFrame sf(this, 2);
const auto& p1 = sf.p[0];
const auto& p2 = sf.p[1];
// Flags value used when the condition of ctesta does not hold (the ": dfv" arm below).
int dfv = 0;
cmp(p1, 3);
ctesta(p2, 1, dfv); // eflags = (p1 > 3) ? ((p2 & 1) == 0) : dfv;
// al = ZF; T_zu is the "zero upper" flag (presumably clears the upper bits of the
// destination register so the int return value is clean - confirm in the APX spec).
setz(al|T_zu);
}
};
// JIT generator for the second ccmp sample.
// Per the inline comments below, the emitted function chains two ccmpe
// instructions to evaluate p1 == 1 && p2 == 2 && p3 == 3 and stores ZF into al.
struct Code2 : Xbyak::CodeGenerator {
Code2()
{
// Frame with 3 parameters; sf.p[0..2] are used as the argument registers.
// NOTE(review): no explicit ret() is emitted in this constructor; presumably
// StackFrame emits the epilogue when sf is destroyed - confirm in xbyak_util.h.
Xbyak::util::StackFrame sf(this, 3);
const auto& p1 = sf.p[0];
const auto& p2 = sf.p[1];
const auto& p3 = sf.p[2];
// Flags value used when the condition of ccmpe does not hold (the ": dfv" arm below).
int dfv = 0;
cmp(p1, 1);
ccmpe(p2, 2, dfv); // eflags = p1==1 ? p2==2 : dfv;
ccmpe(p3, 3, dfv); // eflags = (p1==1 && p2==2) ? p3==3 : dfv;
setz(al|T_zu); // p1==1 && p2==2 && p3==3
}
};
int main()
try
{
{
puts("(p1 > 3) && ((p2 & 1) == 0)");
Code1 c;
auto f = c.getCode<int (*)(int, int)>();
for (int p1 = 2; p1 < 5; p1++) {
for (int p2 = 0; p2 < 3; p2++) {
printf("p1=%d p2=%d ret=%d (%d)\n", p1, p2, f(p1, p2), p1 > 3 && ((p2&1) == 0));
}
}
}
{
puts("p1 == 1 && p2 == 2 && p3 == 3");
Code2 c;
auto f = c.getCode<int (*)(int, int, int)>();
for (int p1 = 0; p1 < 3; p1++) {
for (int p2 = 1; p2 < 4; p2++) {
for (int p3 = 2; p3 < 5; p3++) {
printf("p1=%d p2=%d p3=%d ret=%d (%d)\n", p1, p2, p3, f(p1, p2, p3), p1==1 && p2==2 && p3==3);
}
}
}
}
} catch (std::exception& e) {
printf("ERR %s\n", e.what());
}

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx_vnni waitpkg clflushopt cldemote movdiri movdir64b
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

2
sample/cpuid/arl.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi clflushopt

View file

@ -1,11 +1,25 @@
#!/bin/bash
UPDATE=0
if [ $# -eq 1 ]; then
UPDATE=1
fi
if [ $UPDATE == 1 ]; then
echo "UPDATE"
fi
make -C ../ test_util64
cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl spr)
cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl mtl rpl spr gnr srf grr arl lnl)
for cpu in ${cpus[@]} ; do
echo $cpu
~/bin/sde -$cpu -- ../test_util64 -cpuid > tmp.txt
diff tmp.txt $cpu.txt
if [ $UPDATE == 1 ]; then
~/bin/sde -$cpu -- ../test_util64 -cpuid > $cpu.txt
else
~/bin/sde -$cpu -- ../test_util64 -cpuid > tmp.txt
diff $cpu.txt tmp.txt
fi
done

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt

2
sample/cpuid/gnr.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize amx_fp16 prefetchiti avx10

2
sample/cpuid/grr.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma rao-int cmpccxadd aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand f16c movbe
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand f16c movbe

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx enh_rep rdrand f16c
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx enh_rep rdrand f16c

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd avx512_4vnniw avx512_4fmaps avx512_vpopcntdq
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd avx512_4vnniw avx512_4fmaps avx512_vpopcntdq

2
sample/cpuid/lnl.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide

2
sample/cpuid/mtl.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

2
sample/cpuid/rpl.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe clflushopt

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 avx512_vp2intersect amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize

2
sample/cpuid/srf.txt Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt movdiri movdir64b
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt clwb movdiri movdir64b aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +0,0 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 avx512_vp2intersect amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b

View file

@ -1,2 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe gfni clflushopt cldemote clwb

View file

@ -1,10 +0,0 @@
#!/bin/bash
make -C ../ test_util64
cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl spr)
for cpu in ${cpus[@]} ; do
echo $cpu
~/bin/sde -$cpu -- ../test_util64 -cpuid > $cpu.txt
done

25
sample/no_flags.cpp Normal file
View file

@ -0,0 +1,25 @@
#include <stdio.h>
#include <xbyak/xbyak.h>
// JIT generator demonstrating the T_nf flag on add.
// The emitted function loads -1 into eax, adds 1 either with or without T_nf,
// then uses adc to turn the carry flag into the return value (see inline comments).
// The constructor also prints which variant is being generated.
struct Code : Xbyak::CodeGenerator {
// nf: true to emit the add with T_nf, false to emit the add without it.
Code(bool nf) {
xor_(eax, eax); // CF = 0
mov(eax, -1);
if (nf) {
puts("no flags (with T_nf)");
add(eax|T_nf, eax, 1); // does not change CF
} else {
puts("change flags (without T_nf)");
add(eax, eax, 1); // CF = 1
}
adc(eax, 0); // eax = CF ? 1 : 0
ret();
}
};
int main() {
for (int i = 0; i < 2; i++) {
Code c(i);
printf("i=%d ret=%d\n", i, c.getCode<int(*)()>()());
}
}

View file

@ -31,12 +31,14 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tSSSE3, "ssse3" },
{ Cpu::tSSE41, "sse41" },
{ Cpu::tSSE42, "sse42" },
{ Cpu::tSSE4a, "sse4a" },
{ Cpu::tPOPCNT, "popcnt" },
{ Cpu::t3DN, "3dn" },
{ Cpu::tE3DN, "e3dn" },
{ Cpu::tAESNI, "aesni" },
{ Cpu::tRDTSCP, "rdtscp" },
{ Cpu::tOSXSAVE, "osxsave(xgetvb)" },
{ Cpu::tXSAVE, "xsave(xgetvb)" },
{ Cpu::tOSXSAVE, "osxsave" },
{ Cpu::tPCLMULQDQ, "pclmulqdq" },
{ Cpu::tAVX, "avx" },
{ Cpu::tFMA, "fma" },
@ -86,8 +88,11 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tWAITPKG, "waitpkg" },
{ Cpu::tCLFLUSHOPT, "clflushopt" },
{ Cpu::tCLDEMOTE, "cldemote" },
{ Cpu::tCLWB, "clwb" },
{ Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" },
{ Cpu::tUINTR, "uintr" },
{ Cpu::tSERIALIZE, "serialize" },
{ Cpu::tCLZERO, "clzero" },
{ Cpu::tAMX_FP16, "amx_fp16" },
{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
@ -96,12 +101,25 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tRAO_INT, "rao-int" },
{ Cpu::tCMPCCXADD, "cmpccxadd" },
{ Cpu::tPREFETCHITI, "prefetchiti" },
{ Cpu::tSHA512, "sha512" },
{ Cpu::tSM3, "sm3" },
{ Cpu::tSM4, "sm4" },
{ Cpu::tAVX_VNNI_INT16, "avx_vnni_int16" },
{ Cpu::tAPX_F, "apx_f" },
{ Cpu::tAVX10, "avx10" },
{ Cpu::tAESKLE, "aeskle" },
{ Cpu::tWIDE_KL, "wide_kl" },
{ Cpu::tKEYLOCKER, "keylocker" },
{ Cpu::tKEYLOCKER_WIDE, "keylocker_wide" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
}
printf("\n");
if (onlyCpuidFeature) return;
if (cpu.has(Cpu::tAVX10)) {
printf("AVX10 version %d\n", cpu.getAVX10version());
}
if (cpu.has(Cpu::tPOPCNT)) {
const int n = 0x12345678; // bitcount = 13
const int ok = 13;
@ -127,7 +145,6 @@ void putCPUinfo(bool onlyCpuidFeature)
Core i7-3930K 6 2D
*/
cpu.putFamily();
if (!cpu.has(Cpu::tINTEL)) return;
for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
}

48
sample/zero_upper.cpp Normal file
View file

@ -0,0 +1,48 @@
/*
An example of T_zu (zero upper) flag
> g++ zero_upper.cpp -I ../xbyak
> sde -future -- ./a.out
*/
#include <stdio.h>
#include <xbyak/xbyak.h>
using namespace Xbyak;
/*
	Code generator for the T_zu (zero upper) sample.
	The generated function loads 0x12345678 into eax, executes cmp(eax, eax),
	emits one instruction selected by mode, and returns:
	mode 0 : imul(ax, ax, 0x1234)
	mode 1 : the same imul with T_zu attached to the destination
	mode 2 : setz(al)
	mode 3 : setz with T_zu attached to al
	For any other mode only the mov/cmp/ret sequence is emitted.
	The puts() calls run in the constructor, i.e. while the code is being
	generated, not when the generated function is called.
*/
struct Code : Xbyak::CodeGenerator {
Code(int mode)
{
// start from a value whose upper bits are non-zero
mov(eax, 0x12345678);
cmp(eax, eax); // ZF=1
switch (mode) {
case 0: // imul
puts("imul");
imul(ax,ax, 0x1234);
break;
case 1: // imul+zu
puts("imul+zu");
imul(ax|T_zu, ax, 0x1234);
break;
case 2: // setz
puts("setz");
setz(al);
break;
case 3: // setz+zu
puts("setz+zu");
setz(al|T_zu);
break;
}
ret();
}
};
/*
	Generate and run the four variants (mode 0..3) and print the value each
	generated function returns as 8 hex digits.
	Returns 0 on success and 1 if a std::exception was thrown.
*/
int main()
	try
{
	for (int mode = 0; mode < 4; mode++) {
		Code c(mode);
		auto f = c.getCode<int (*)()>();
		// %x requires an unsigned int argument, so convert explicitly
		printf("ret=%08x\n", static_cast<unsigned int>(f()));
	}
	return 0;
} catch (const std::exception& e) {
	printf("ERR %s\n", e.what());
	// report the failure through the exit status instead of falling off the handler (which yields 0)
	return 1;
}

View file

@ -1,5 +1,5 @@
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
XBYAK_INC=../xbyak/xbyak.h
XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h
UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32)
X32?=1
@ -13,45 +13,50 @@ ifeq ($(UNAME_S),Darwin)
# 32-bit binary is not supported
ONLY_64BIT=1
endif
ifeq ($(findstring MINGW64,$(UNAME_S)),MINGW64)
ONLY_64BIT=1
endif
ifeq ($(ONLY_64BIT),0)
TARGET += jmp address
endif
ifeq ($(BIT),64)
TARGET += jmp64 address64
TARGET += jmp64 address64 apx
endif
all: $(TARGET)
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
CFLAGS=-O2 -Wall -I.. -I. $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
make_nm:
$(CXX) $(CFLAGS) make_nm.cpp -o $@
normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h
normalize_prefix: normalize_prefix.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) normalize_prefix.cpp -o $@
test_mmx: test_mmx.cpp ../xbyak/xbyak.h
test_mmx: test_mmx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) test_mmx.cpp -o $@ -lpthread
jmp: jmp.cpp ../xbyak/xbyak.h
jmp: jmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) jmp.cpp -o $@ -m32
jmp64: jmp.cpp ../xbyak/xbyak.h
jmp64: jmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) jmp.cpp -o $@ -m64
address: address.cpp ../xbyak/xbyak.h
address: address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) address.cpp -o $@ -m32
address64: address.cpp ../xbyak/xbyak.h
address64: address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) address.cpp -o $@ -m64
bad_address: bad_address.cpp ../xbyak/xbyak.h
bad_address: bad_address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) bad_address.cpp -o $@
misc: misc.cpp ../xbyak/xbyak.h
misc: misc.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) misc.cpp -o $@
misc32: misc.cpp ../xbyak/xbyak.h
misc32: misc.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) misc.cpp -o $@ -DXBYAK32
cvt_test: cvt_test.cpp ../xbyak/xbyak.h
cvt_test: cvt_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@
cvt_test32: cvt_test.cpp ../xbyak/xbyak.h
cvt_test32: cvt_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -DXBYAK32
noexception: noexception.cpp ../xbyak/xbyak.h
noexception: noexception.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -fno-exceptions
apx: apx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) apx.cpp -o $@
test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen
@ -75,6 +80,7 @@ ifneq ($(X32),1)
CXX=$(CXX) ./test_nm.sh Y64
endif
./jmp64
./apx
endif
test_avx: normalize_prefix
@ -112,3 +118,4 @@ lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
make_nm: make_nm.cpp $(XBYAK_INC)
.PHONY: test

1964
test/apx.cpp Normal file

File diff suppressed because it is too large Load diff

View file

@ -109,8 +109,8 @@ CYBOZU_TEST_AUTO(changeBit)
{ &dil, &di, &edi, &rdi, &xmm7, &ymm7, &zmm7 },
{ &r8b, &r8w, &r8d, &r8, &xmm8, &ymm8, &zmm8 },
{ &r15b, &r15w, &r15d, &r15, &xmm15, &ymm15, &zmm15 },
{ 0, 0, 0, 0, &xmm16, &ymm16, &zmm16 },
{ 0, 0, 0, 0, &xmm31, &ymm31, &zmm31 },
{ &r16b, &r16w, &r16d, &r16, &xmm16, &ymm16, &zmm16 },
{ &r31b, &r31w, &r31d, &r31, &xmm31, &ymm31, &zmm31 },
};
const int bitTbl[N] = { 8, 16, 32, 64, 128, 256, 512 };
#else

View file

@ -558,6 +558,7 @@ class Test {
"wbinvd",
"wrmsr",
"xlatb",
"xend",
"popf",
"pushf",
@ -1050,6 +1051,10 @@ class Test {
"nle",
"g",
};
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-truncation" // wrong detection
#endif
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
char buf[32];
snprintf(buf, sizeof(buf), "cmov%s", tbl[i]);
@ -1059,6 +1064,9 @@ class Test {
snprintf(buf, sizeof(buf), "set%s", tbl[i]);
put(buf, REG8|REG8_3|MEM);
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
void putReg1() const
{
@ -1326,6 +1334,7 @@ class Test {
#ifdef XBYAK64
put("cmpxchg16b", MEM);
put("fxrstor64", MEM);
put("xbegin", "0x12345678");
#endif
{
const char tbl[][8] = {
@ -1348,6 +1357,7 @@ class Test {
put("xchg", EAX|REG32, EAX|REG32|MEM);
put("xchg", MEM, EAX|REG32);
put("xchg", REG64, REG64|MEM);
put("xabort", IMM8);
}
void putShift() const
{
@ -1493,18 +1503,6 @@ class Test {
put(p, XMM, XMM|MEM, IMM);
}
}
{
const char tbl[][16] = {
"pclmullqlqdq",
"pclmulhqlqdq",
// "pclmullqhdq", // QQQ : not supported by nasm/yasm
// "pclmulhqhdq",
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *p = tbl[i];
put(p, XMM, XMM|MEM);
}
}
put("extractps", REG32e|MEM, XMM, IMM);
put("pextrw", REG32e|MEM, XMM, IMM); // pextrw for REG32 is for MMX2
put("pextrb", REG32e|MEM, XMM, IMM);
@ -1522,6 +1520,23 @@ class Test {
#endif
}
/*
	Emit test patterns for the pclmul*qdq aliases.
	Each table entry is the 'v'-prefixed mnemonic; the mnemonic without the
	leading 'v' is obtained by skipping the first character.
*/
void putVpclmulqdq()
{
	const char tbl[][16] = {
		"vpclmullqlqdq",
		"vpclmulhqlqdq",
		"vpclmullqhqdq",
		"vpclmulhqhqdq",
	};
	for (size_t idx = 0; idx < NUM_OF_ARRAY(tbl); idx++) {
		const char *const withV = tbl[idx];
		const char *const withoutV = withV + 1; // drop the leading 'v'
		// two-operand form for the name without 'v'
		put(withoutV, XMM, XMM|MEM);
		// three-operand forms for the name with 'v'
		put(withV, XMM, XMM, XMM|MEM);
		put(withV, YMM, YMM, YMM|MEM);
		put(withV, ZMM, ZMM, ZMM|MEM);
	}
}
void putSHA() const
{
put("sha1rnds4", XMM, XMM|MEM, IMM);
@ -2569,6 +2584,7 @@ public:
putPushPop8_16();
#else
putSIMPLE();
putVpclmulqdq();
putReg1();
putBt();
putRorM();

View file

@ -1949,6 +1949,12 @@ CYBOZU_TEST_AUTO(misc)
movdiri(ptr[rax+r12], r9);
movdiri(ptr[rax+r12*2+4], r9d);
movdir64b(r10, ptr[r8]);
clui();
senduipi(rax);
senduipi(r10);
stui();
testui();
uiret();
#endif
}
} c;
@ -1972,6 +1978,12 @@ CYBOZU_TEST_AUTO(misc)
0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
0xf3, 0x0f, 0x01, 0xee, // clui
0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
0xf3, 0x0f, 0x01, 0xef, // stui
0xf3, 0x0f, 0x01, 0xed, // testui
0xf3, 0x0f, 0x01, 0xec, // uiret
#endif
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
@ -2157,4 +2169,116 @@ CYBOZU_TEST_AUTO(prefetchiti)
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
/*
	Encoding test for the vsha512*, vsm3* and vsm4* instructions.
	The constructor emits the instructions in order; tbl holds the expected
	bytes, one row per emitted instruction in the same order, and the test
	compares both the total size and the byte sequence.
*/
CYBOZU_TEST_AUTO(crypto)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vsha512msg1(ymm3, xmm5);
vsha512msg2(ymm9, ymm10);
vsha512rnds2(ymm1, ymm3, xmm2);
vsm3msg1(xmm1, xmm2, xmm3);
vsm3msg1(xmm1, xmm2, ptr [rax]);
vsm3msg2(xmm5, xmm7, xmm3);
vsm3msg2(xmm5, xmm6, ptr [rax]);
vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
vsm4key4(xmm1, xmm2, xmm3);
vsm4key4(xmm1, xmm2, ptr [rdx]);
vsm4rnds4(xmm1, xmm2, xmm3);
vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
}
} c;
const uint8_t tbl[] = {
// sha512
0xc4, 0xe2, 0x7f, 0xcc, 0xdd, // vsha512msg1(ymm3, xmm5)
0xc4, 0x42, 0x7f, 0xcd, 0xca, // vsha512msg2(ymm9, ymm10)
0xc4, 0xe2, 0x67, 0xcb, 0xca, // vsha512rnds2(ymm1, ymm3, xmm2)
// sm3
0xC4, 0xE2, 0x68, 0xDA, 0xCB, // vsm3msg1(xmm1, xmm2, xmm3)
0xC4, 0xE2, 0x68, 0xDA, 0x08, // vsm3msg1(xmm1, xmm2, ptr [rax])
0xC4, 0xE2, 0x41, 0xDA, 0xEB, // vsm3msg2(xmm5, xmm7, xmm3)
0xC4, 0xE2, 0x49, 0xDA, 0x28, // vsm3msg2(xmm5, xmm6, ptr [rax])
0xC4, 0xE3, 0x41, 0xDE, 0xEB, 0x12, // vsm3rnds2(xmm5, xmm7, xmm3, 0x12)
0xC4, 0xE3, 0x41, 0xDE, 0x29, 0x34, // vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34)
// sm4
0xc4, 0xe2, 0x6a, 0xda, 0xcb, // vsm4key4(xmm1, xmm2, xmm3)
0xc4, 0xe2, 0x6a, 0xda, 0x0a, // vsm4key4(xmm1, xmm2, ptr [rdx])
0xc4, 0xe2, 0x6b, 0xda, 0xcb, // vsm4rnds4(xmm1, xmm2, xmm3)
0xc4, 0xe2, 0x4b, 0xda, 0x2c, 0x81, // vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4])
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// the generated code must match the table in both length and content
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
/*
	Encoding test for the vpdpb* and vpdpw* instructions.
	Every mnemonic is emitted twice: once with xmm register operands and once
	with ymm registers and a memory source. tbl holds the expected bytes, one
	row per emitted instruction in the same order.
*/
CYBOZU_TEST_AUTO(avx_vnni_int)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpdpbssd(xmm1, xmm2, xmm3);
vpdpbssd(ymm1, ymm2, ptr [rax]);
vpdpbssds(xmm1, xmm2, xmm3);
vpdpbssds(ymm1, ymm2, ptr [rax]);
vpdpbsud(xmm1, xmm2, xmm3);
vpdpbsud(ymm1, ymm2, ptr [rax]);
vpdpbsuds(xmm1, xmm2, xmm3);
vpdpbsuds(ymm1, ymm2, ptr [rax]);
vpdpbuud(xmm1, xmm2, xmm3);
vpdpbuud(ymm1, ymm2, ptr [rax]);
vpdpbuuds(xmm1, xmm2, xmm3);
vpdpbuuds(ymm1, ymm2, ptr [rax]);
vpdpwsud(xmm1, xmm2, xmm3);
vpdpwsud(ymm1, ymm2, ptr [rax]);
vpdpwsuds(xmm1, xmm2, xmm3);
vpdpwsuds(ymm1, ymm2, ptr [rax]);
vpdpwusd(xmm1, xmm2, xmm3);
vpdpwusd(ymm1, ymm2, ptr [rax]);
vpdpwusds(xmm1, xmm2, xmm3);
vpdpwusds(ymm1, ymm2, ptr [rax]);
vpdpwuud(xmm1, xmm2, xmm3);
vpdpwuud(ymm1, ymm2, ptr [rax]);
vpdpwuuds(xmm1, xmm2, xmm3);
vpdpwuuds(ymm1, ymm2, ptr [rax]);
}
} c;
const uint8_t tbl[] = {
0xc4, 0xe2, 0x6b, 0x50, 0xcb, // vpdpbssd xmm
0xc4, 0xe2, 0x6f, 0x50, 0x08, // vpdpbssd ymm, [rax]
0xc4, 0xe2, 0x6b, 0x51, 0xcb, // vpdpbssds xmm
0xc4, 0xe2, 0x6f, 0x51, 0x08, // vpdpbssds ymm, [rax]
0xc4, 0xe2, 0x6a, 0x50, 0xcb, // vpdpbsud xmm
0xc4, 0xe2, 0x6e, 0x50, 0x08, // vpdpbsud ymm, [rax]
0xc4, 0xe2, 0x6a, 0x51, 0xcb, // vpdpbsuds xmm
0xc4, 0xe2, 0x6e, 0x51, 0x08, // vpdpbsuds ymm, [rax]
0xc4, 0xe2, 0x68, 0x50, 0xcb, // vpdpbuud xmm
0xc4, 0xe2, 0x6c, 0x50, 0x08, // vpdpbuud ymm, [rax]
0xc4, 0xe2, 0x68, 0x51, 0xcb, // vpdpbuuds xmm
0xc4, 0xe2, 0x6c, 0x51, 0x08, // vpdpbuuds ymm, [rax]
0xc4, 0xe2, 0x6a, 0xd2, 0xcb, // vpdpwsud xmm
0xc4, 0xe2, 0x6e, 0xd2, 0x08, // vpdpwsud ymm, [rax]
0xc4, 0xe2, 0x6a, 0xd3, 0xcb, // vpdpwsuds xmm
0xc4, 0xe2, 0x6e, 0xd3, 0x08, // vpdpwsuds ymm, [rax]
0xc4, 0xe2, 0x69, 0xd2, 0xcb, // vpdpwusd xmm
0xc4, 0xe2, 0x6d, 0xd2, 0x08, // vpdpwusd ymm, [rax]
0xc4, 0xe2, 0x69, 0xd3, 0xcb, // vpdpwusds xmm
0xc4, 0xe2, 0x6d, 0xd3, 0x08, // vpdpwusds ymm, [rax]
0xc4, 0xe2, 0x68, 0xd2, 0xcb, // vpdpwuud xmm
0xc4, 0xe2, 0x6c, 0xd2, 0x08, // vpdpwuud ymm, [rax]
0xc4, 0xe2, 0x68, 0xd3, 0xcb, // vpdpwuuds xmm
0xc4, 0xe2, 0x6c, 0xd3, 0x08, // vpdpwuuds ymm, [rax]
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// the generated code must match the table in both length and content
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif

View file

@ -8,14 +8,25 @@
typedef unsigned char uint8_t;
std::string normalize(const std::string& line)
std::string normalize(std::string line)
{
size_t pos = line.find('(');
/* nasm generates byte codes containing () for xbegin, so remove it. */
if (pos != std::string::npos) {
line.erase(pos, 1);
pos = line.find(')');
if (pos == std::string::npos) {
fprintf(stderr, "line error {%s}\n", line.c_str());
return "";
}
line.erase(pos, 1);
}
static const char tbl[][3] = { "66", "67", "F2", "F3" };
size_t tblNum = sizeof(tbl) / sizeof(tbl[0]);
typedef std::set<std::string> StringSet;
StringSet suf;
size_t pos = 0;
pos = 0;
for (; pos < line.size(); pos += 2) {
bool found = false;
for (size_t i = 0; i < tblNum; i++) {

View file

@ -23,7 +23,7 @@ echo "xbyak"
echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame > x.lst
diff ok.lst x.lst && echo "ok"
diff -bB ok.lst x.lst && echo "ok"
}

View file

@ -5,4 +5,10 @@ call test_address
call test_address 64
echo *** test jmp address ***
call test_jmp
echo *** test misc ***
set FILE=misc
call test_misc
echo *** test APX ***
set FILE=apx
call test_misc
echo *** all test end ***

View file

@ -48,4 +48,4 @@ echo "xbyak"
echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff -bB ok.lst x.lst && echo "ok"

View file

@ -35,4 +35,4 @@ echo "xbyak"
echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff -bB ok.lst x.lst && echo "ok"

View file

@ -1,4 +1,4 @@
call set_opt
bmake -f Makefile.win all
cl -I../ -I./ -DXBYAK_TEST misc.cpp %OPT% /Od /Zi
misc
cl -I../ -I./ -DXBYAK_TEST %FILE%.cpp %OPT% /Od /Zi
%FILE%

View file

@ -61,4 +61,4 @@ echo "xbyak"
echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok"
diff -bB ok.lst x.lst && echo "ok"

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -9,6 +9,13 @@
#define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y;
#endif
/*
	XBYAK_CONSTEXPR expands to constexpr when either
	- __cplusplus >= 201402L and the compiler is not a non-clang GCC with __GNUC__ <= 5, or
	- the compiler is MSVC with _MSC_VER >= 1910;
	otherwise it expands to nothing.
	A definition supplied before this point is left untouched.
*/
#ifndef XBYAK_CONSTEXPR
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else
#define XBYAK_CONSTEXPR
#endif
#endif
#else
#include <string.h>
@ -93,7 +100,7 @@ struct TypeT {
};
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
// Return the larger of x and y; x is returned when x >= y holds, otherwise y.
template<typename T>
inline T max_(T x, T y)
{
	if (x >= y) return x;
	return y;
}
@ -137,6 +144,7 @@ private:
uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharignDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_;
uint32_t avx10version_;
uint32_t get32bitAsBE(const char *x) const
{
@ -173,11 +181,9 @@ private:
}
void setNumCores()
{
if (!has(tINTEL)) return;
if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
@ -211,7 +217,48 @@ private:
}
void setCacheHierarchy()
{
if (!has(tINTEL)) return;
if (!has(tINTEL) && !has(tAMD)) return;
// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
if (has(tAMD)) {
	// There are 3 Data Cache Levels (L1, L2, L3)
	dataCacheLevels_ = 3;
	const uint32_t leaf = 0x8000001D; // for modern AMD CPUs
	// Sub leaf value ranges from 0 to 3
	// Sub leaf value 0 refers to L1 Data Cache
	// Sub leaf value 1 refers to L1 Instruction Cache
	// Sub leaf value 2 refers to L2 Cache
	// Sub leaf value 3 refers to L3 Cache
	// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
	// and 0x80000006 for L2 and L3 cache

	// These two values do not change per sub leaf, so read them once.
	// numCores_[0] is used as a divisor below; nothing visible here guarantees
	// it is non-zero (e.g. if setNumCores() could not determine the topology),
	// so fall back to 1 instead of dividing by zero.
	int smt_width = numCores_[0];
	if (smt_width < 1) {
		smt_width = 1;
	}
	const int logical_cores = numCores_[1];

	int cache_index = 0;
	for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
		// Skip sub_leaf = 1 as it refers to
		// L1 Instruction Cache (not required)
		if (sub_leaf == 1) {
			continue;
		}
		uint32_t data[4] = {};
		getCpuidEx(leaf, sub_leaf, data);
		// Cache Size = Line Size * Partitions * Associativity * Cache Sets
		// Each field is stored as (value - 1), hence the +1 on every term.
		dataCacheSize_[cache_index] =
			(extractBit(data[1], 22, 31) + 1) // Associativity
			* (extractBit(data[1], 12, 21) + 1) // Partitions
			* (extractBit(data[1], 0, 11) + 1) // Line Size
			* (data[2] + 1); // Cache Sets
		// Calculate the number of cores sharing the current data cache
		int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
		if (logical_cores != 0) {
			actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
		}
		coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
		++cache_index;
	}
	return;
}
// intel
const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1;
// const uint32_t INSTRUCTION_CACHE = 2;
@ -417,6 +464,21 @@ public:
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
XBYAK_DEFINE_TYPE(76, tUINTR);
XBYAK_DEFINE_TYPE(77, tXSAVE);
XBYAK_DEFINE_TYPE(78, tSHA512);
XBYAK_DEFINE_TYPE(79, tSM3);
XBYAK_DEFINE_TYPE(80, tSM4);
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
XBYAK_DEFINE_TYPE(82, tAPX_F);
XBYAK_DEFINE_TYPE(83, tAVX10);
XBYAK_DEFINE_TYPE(84, tAESKLE);
XBYAK_DEFINE_TYPE(85, tWIDE_KL);
XBYAK_DEFINE_TYPE(86, tKEYLOCKER);
XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE);
XBYAK_DEFINE_TYPE(88, tSSE4a);
XBYAK_DEFINE_TYPE(89, tCLWB);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
@ -428,6 +490,7 @@ public:
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
, avx10version_(0)
{
uint32_t data[4] = {};
const uint32_t& EAX = data[0];
@ -462,13 +525,14 @@ public:
if (maxExtendedNum >= 0x80000001) {
getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (ECX & (1U << 5)) type_ |= tLZCNT;
if (ECX & (1U << 6)) type_ |= tSSE4a;
if (ECX & (1U << 8)) type_ |= tPREFETCHW;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 31)) type_ |= t3DN;
}
if (maxExtendedNum >= 0x80000008) {
@ -478,16 +542,17 @@ public:
getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 9)) type_ |= tSSSE3;
if (ECX & (1U << 19)) type_ |= tSSE41;
if (ECX & (1U << 20)) type_ |= tSSE42;
if (ECX & (1U << 22)) type_ |= tMOVBE;
if (ECX & (1U << 23)) type_ |= tPOPCNT;
if (ECX & (1U << 25)) type_ |= tAESNI;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 26)) type_ |= tXSAVE;
if (ECX & (1U << 27)) type_ |= tOSXSAVE;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (ECX & (1U << 29)) type_ |= tF16C;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 23)) type_ |= tMMX;
@ -498,8 +563,8 @@ public:
// check XFEATURE_ENABLED_MASK[2:1] = '11b'
uint64_t bv = getXfeature();
if ((bv & 6) == 6) {
if (ECX & (1U << 28)) type_ |= tAVX;
if (ECX & (1U << 12)) type_ |= tFMA;
if (ECX & (1U << 28)) type_ |= tAVX;
// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7)
@ -533,29 +598,36 @@ public:
const uint32_t maxNumSubLeaves = EAX;
if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
if (EBX & (1U << 3)) type_ |= tBMI1;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 8)) type_ |= tBMI2;
if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 18)) type_ |= tRDSEED;
if (EBX & (1U << 19)) type_ |= tADX;
if (EBX & (1U << 20)) type_ |= tSMAP;
if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 24)) type_ |= tCLWB;
if (EBX & (1U << 29)) type_ |= tSHA;
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
if (ECX & (1U << 5)) type_ |= tWAITPKG;
if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 23)) type_ |= tKEYLOCKER;
if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
if (ECX & (1U << 27)) type_ |= tMOVDIRI;
if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
if (EDX & (1U << 5)) type_ |= tUINTR;
if (EDX & (1U << 14)) type_ |= tSERIALIZE;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 0)) type_ |= tSHA512;
if (EAX & (1U << 1)) type_ |= tSM3;
if (EAX & (1U << 2)) type_ |= tSM4;
if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
@ -566,9 +638,22 @@ public:
if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
if (EDX & (1U << 19)) type_ |= tAVX10;
if (EDX & (1U << 21)) type_ |= tAPX_F;
}
}
if (maxNum >= 0x19) {
getCpuidEx(0x19, 0, data);
if (EBX & (1U << 0)) type_ |= tAESKLE;
if (EBX & (1U << 2)) type_ |= tWIDE_KL;
if (type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE;
}
if (has(tAVX10) && maxNum >= 0x24) {
getCpuidEx(0x24, 0, data);
avx10version_ = EBX & mask(7);
}
setFamily();
setNumCores();
setCacheHierarchy();
@ -585,6 +670,7 @@ public:
{
return (type & type_) == type;
}
// Returns avx10version_. The constructor initializes it to 0 and, only when tAVX10 is set and
// maxNum >= 0x24, overwrites it with (EBX & mask(7)) from getCpuidEx(0x24, 0, ...).
int getAVX10version() const { return avx10version_; }
};
#ifndef XBYAK_ONLY_CLASS_CPU