externals: Update xbyak to v7.05

Merge commit 'fdf626b74f35deedce0e6196c36b8c9f846c038a'
This commit is contained in:
Merry 2024-01-30 00:36:49 +00:00
commit 213fe7a452
62 changed files with 5179 additions and 2059 deletions

View file

@ -149,7 +149,7 @@ if ("arm64" IN_LIST ARCHITECTURE OR DYNARMIC_TESTS)
endif() endif()
if ("x86_64" IN_LIST ARCHITECTURE) if ("x86_64" IN_LIST ARCHITECTURE)
find_package(xbyak 6 CONFIG) find_package(xbyak 7 CONFIG)
find_package(Zydis 4 CONFIG) find_package(Zydis 4 CONFIG)
endif() endif()

View file

@ -15,7 +15,7 @@ if (NOT @BUILD_SHARED_LIBS@)
endif() endif()
if ("x86_64" IN_LIST ARCHITECTURE) if ("x86_64" IN_LIST ARCHITECTURE)
find_dependency(xbyak 6) find_dependency(xbyak 7)
find_dependency(Zydis 4) find_dependency(Zydis 4)
endif() endif()

View file

@ -1,53 +1,49 @@
cmake_minimum_required(VERSION 2.6...3.0.2) cmake_minimum_required(VERSION 3.5)
project(xbyak LANGUAGES CXX VERSION 6.68) project(xbyak LANGUAGES CXX VERSION 7.05)
file(GLOB headers xbyak/*.h) file(GLOB headers xbyak/*.h)
if (DEFINED CMAKE_VERSION AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.0.2) include(GNUInstallDirs)
include(GNUInstallDirs) add_library(${PROJECT_NAME} INTERFACE)
add_library(${PROJECT_NAME} INTERFACE) add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
target_include_directories(
${PROJECT_NAME} INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
install( target_include_directories(
TARGETS ${PROJECT_NAME} ${PROJECT_NAME} INTERFACE
EXPORT ${PROJECT_NAME}-targets "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
) )
include(CMakePackageConfigHelpers) install(
configure_package_config_file( TARGETS ${PROJECT_NAME}
cmake/config.cmake.in EXPORT ${PROJECT_NAME}-targets
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
cmake/config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
COMPATIBILITY SameMajorVersion
)
install(
FILES
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake" "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
COMPATIBILITY SameMajorVersion DESTINATION
) ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
install( install(
FILES EXPORT ${PROJECT_NAME}-targets
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" NAMESPACE ${PROJECT_NAME}::
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
DESTINATION )
${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
install(
EXPORT ${PROJECT_NAME}-targets
NAMESPACE ${PROJECT_NAME}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
elseif(NOT DEFINED CMAKE_INSTALL_INCLUDEDIR)
set(CMAKE_INSTALL_INCLUDEDIR "include")
endif()
install( install(
FILES ${headers} FILES ${headers}

View file

@ -1,5 +1,18 @@
# History # History
* 2024/Jan/03 ver 7.05 support RAO-INT for APX
* 2023/Dec/28 ver 7.04 rex2 supports two-byte opecode
* 2023/Dec/26 ver 7.03 set the default value of dfv to 0
* 2023/Dec/20 ver 7.02 SHA* support APX
* 2023/Dec/19 ver 7.01 support AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE, detection of APX10/APX
* 2023/Dec/01 ver 7.00 support APX
* 2023/Aug/07 ver 6.73 add sha512/sm3/sm4/avx-vnni-int16
* 2023/Aug/02 ver 6.72 add xbegin/xabort/xend
* 2023/Jul/27 ver 6.71 Allocator supports huge page
* 2023/Jul/05 ver 6.70 add alias of vclmulqdq, correct alias of pclmulqdq
* 2023/Jun/27 ver 6.69.2 add constexpr to `TypeT operator|` (thanks to Wunkolo)
* 2023/Mar/23 ver 6.69.1 add detection of xsave (thanks to Wunkolo)
* 2023/Feb/20 ver 6.69 util::Cpu supports AMD CPUs. support UINTR
* 2022/Dec/07 ver 6.68 support prefetchit{0,1} * 2022/Dec/07 ver 6.68 support prefetchit{0,1}
* 2022/Nov/30 ver 6.67 support CMPccXADD * 2022/Nov/30 ver 6.67 support CMPccXADD
* 2022/Nov/25 ver 6.66 support RAO-INT * 2022/Nov/25 ver 6.66 support RAO-INT

View file

@ -128,6 +128,34 @@ vpdpbusd(xm0, xm1, xm2); // VEX encoding
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined. * use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. * specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
## APX
[Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html)
- Support 64-bit 16 additional GPRs (general-purpose registers) r16, ..., r31
- 32-bit regs are r16d, ..., r31d
- 16-bit regs are r16w, ..., r31w
- 8-bit regs are r16b, ..., r31b
- `add(r20, r21);`
- `lea(r30, ptr[r29+r31]);`
- Support three-operand instruction
- `add(r20, r21, r23);`
- `add(r20, ptr[rax + rcx * 8 + 0x1234], r23);`
- Support T_nf for NF=1 (status flags update suppression)
- `add(r20|T_nf, r21, r23);` // Set EVEX.NF=1
- Support T_zu for NF=ZU (zero upper) for imul and setcc
- `imul(ax|T_zu, cx, 0x1234);` // Set ND=ZU
- `imul(ax|T_zu|T_nf, cx, 0x1234);` // Set ND=ZU and EVEX.NF=1
- `setb(r31b|T_zu);` // same as set(r31b); movzx(r31, r31b);
- See [sample/zero_upper.cpp](../sample/zero_upper.cpp)
### ccmpSCC and ctestSCC
- ccmpSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? cmp(op1, op2) : dfv
- ctestSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? test(op1, op2) : dfv
- SCC means source condition code such as z, a, gt.
- See [sample/ccmp.cpp](../sample/ccmp.cpp)
- Specify the union of T_of(=8), T_sf(=4), T_zf(=2), or T_cf(=1) for dfv.
## Label ## Label
Two kinds of Label are supported. (String literal and Label class). Two kinds of Label are supported. (String literal and Label class).

View file

@ -1,12 +1,17 @@
TARGET=../xbyak/xbyak_mnemonic.h TARGET=../xbyak/xbyak_mnemonic.h
BIN=sortline gen_code gen_avx512 BIN=sortline gen_code gen_avx512
CFLAGS=-I../ -O2 -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) CFLAGS=-I../ -I ./ -Wall -Wextra -Wno-missing-field-initializers $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS)
all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt all: $(TARGET) ../CMakeLists.txt ../meson.build ../readme.md ../readme.txt
avx_type_def.h: ../xbyak/xbyak.h
sed -n '/@@@begin of avx_type_def.h/,/@@@end of avx_type_def.h/p' $< > $@
avx_type.hpp: avx_type_def.h
sortline: sortline.cpp sortline: sortline.cpp
$(CXX) $(CFLAGS) $< -o $@ $(CXX) $(CFLAGS) $< -o $@
gen_code: gen_code.cpp ../xbyak/xbyak.h avx_type.hpp gen_code: gen_code.cpp avx_type.hpp
$(CXX) $(CFLAGS) $< -o $@ $(CXX) $(CFLAGS) $< -o $@
gen_avx512: gen_avx512.cpp ../xbyak/xbyak.h avx_type.hpp gen_avx512: gen_avx512.cpp avx_type.hpp
$(CXX) $(CFLAGS) $< -o $@ $(CXX) $(CFLAGS) $< -o $@
$(TARGET): $(BIN) $(TARGET): $(BIN)
@ -36,4 +41,4 @@ VER=$(shell head -n 1 ../xbyak/xbyak_mnemonic.h|grep -o "[0-9.]*")
sed -l 2 -i -e "s/Xbyak [0-9.]*/Xbyak $(VER)/" $@ sed -l 2 -i -e "s/Xbyak [0-9.]*/Xbyak $(VER)/" $@
clean: clean:
$(RM) $(BIN) $(TARGET) $(RM) $(BIN) $(TARGET) avx_type_def.h

View file

@ -1,190 +1,72 @@
#include <assert.h> #include <assert.h>
// copy CodeGenerator::AVXtype #include "avx_type_def.h"
enum AVXtype {
// low 3 bit
T_N1 = 1,
T_N2 = 2,
T_N4 = 3,
T_N8 = 4,
T_N16 = 5,
T_N32 = 6,
T_NX_MASK = 7,
//
T_N_VL = 1 << 3, // N * (1, 2, 4) for VL
T_DUP = 1 << 4, // N = (8, 32, 64)
T_66 = 1 << 5, // pp = 1
T_F3 = 1 << 6, // pp = 2
T_F2 = T_66 | T_F3, // pp = 3
T_ER_R = 1 << 7, // reg{er}
T_0F = 1 << 8,
T_0F38 = 1 << 9,
T_0F3A = 1 << 10,
T_L0 = 1 << 11,
T_L1 = 1 << 12,
T_W0 = 1 << 13,
T_W1 = 1 << 14,
T_EW0 = 1 << 15,
T_EW1 = 1 << 16,
T_YMM = 1 << 17, // support YMM, ZMM
T_EVEX = 1 << 18,
T_ER_X = 1 << 19, // xmm{er}
T_ER_Y = 1 << 20, // ymm{er}
T_ER_Z = 1 << 21, // zmm{er}
T_SAE_X = 1 << 22, // xmm{sae}
T_SAE_Y = 1 << 23, // ymm{sae}
T_SAE_Z = 1 << 24, // zmm{sae}
T_MUST_EVEX = 1 << 25, // contains T_EVEX
T_B32 = 1 << 26, // m32bcst
T_B64 = 1 << 27, // m64bcst
T_B16 = T_B32 | T_B64, // m16bcst
T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_FP16 = 1 << 31,
T_MAP5 = T_FP16 | T_0F,
T_MAP6 = T_FP16 | T_0F38,
T_XXX
};
// T_66 = 1, T_F3 = 2, T_F2 = 3
uint32_t getPP(int type) { return (type >> 5) & 3; }
const int NONE = 256; // same as Xbyak::CodeGenerator::NONE const int NONE = 256; // same as Xbyak::CodeGenerator::NONE
std::string type2String(int type) std::string type2String(uint64_t type)
{ {
if (type == 0) return "T_NONE";
std::string str; std::string str;
int low = type & T_NX_MASK; int low = type & T_NX_MASK;
if (0 < low) { if (0 < low && low < 7) {
const char *tbl[8] = { const char *tbl[8] = {
"T_N1", "T_N2", "T_N4", "T_N8", "T_N16", "T_N32" "T_N1", "T_N2", "T_N4", "T_N8", "T_N16", "T_N32"
}; };
assert(low < int(sizeof(tbl) / sizeof(tbl[0]))); assert(low < int(sizeof(tbl) / sizeof(tbl[0])));
str = tbl[low - 1]; str = tbl[low - 1];
} }
if (type & T_N_VL) { if (type & T_N_VL) str += "|T_N_VL";
if (!str.empty()) str += " | "; if (type & T_APX) str += "|T_APX";
str += "T_N_VL"; if ((type & T_NX_MASK) == T_DUP) str += "|T_DUP";
} if (type & T_66) str += "|T_66";
if (type & T_DUP) { if (type & T_F3) str += "|T_F3";
if (!str.empty()) str += " | "; if (type & T_F2) str += "|T_F2";
str += "T_DUP";
}
if (type & T_F2) {
if (!str.empty()) str += " | ";
switch (type & T_F2) {
case T_66: str += "T_66"; break;
case T_F3: str += "T_F3"; break;
case T_F2: str += "T_F2"; break;
default: break;
}
}
if (type & T_0F) { if (type & T_0F) {
if (!str.empty()) str += " | ";
if (type & T_FP16) { if (type & T_FP16) {
str += "T_MAP5"; str += "|T_MAP5";
} else { } else {
str += "T_0F"; str += "|T_0F";
} }
} }
if (type & T_0F38) { if (type & T_0F38) {
if (!str.empty()) str += " | ";
if (type & T_FP16) { if (type & T_FP16) {
str += "T_MAP6"; str += "|T_MAP6";
} else { } else {
str += "T_0F38"; str += "|T_0F38";
} }
} }
if (type & T_0F3A) { if (type & T_0F3A) str += "|T_0F3A";
if (!str.empty()) str += " | "; if (type & T_L0) str += "|T_L0";
str += "T_0F3A"; if (type & T_L1) str += "|T_L1";
} if (type & T_W0) str += "|T_W0";
if (type & T_L0) { if (type & T_W1) str += "|T_W1";
if (!str.empty()) str += " | "; if (type & T_EW0) str += "|T_EW0";
str += "VEZ_L0"; if (type & T_EW1) str += "|T_EW1";
} if (type & T_YMM) str += "|T_YMM";
if (type & T_L1) { if (type & T_EVEX) str += "|T_EVEX";
if (!str.empty()) str += " | "; if (type & T_ER_X) str += "|T_ER_X";
str += "VEZ_L1"; if (type & T_ER_Y) str += "|T_ER_Y";
} if (type & T_ER_Z) str += "|T_ER_Z";
if (type & T_W0) { if (type & T_ER_R) str += "|T_ER_R";
if (!str.empty()) str += " | "; if (type & T_SAE_X) str += "|T_SAE_X";
str += "T_W0"; if (type & T_SAE_Y) str += "|T_SAE_Y";
} if (type & T_SAE_Z) str += "|T_SAE_Z";
if (type & T_W1) { if (type & T_MUST_EVEX) str += "|T_MUST_EVEX";
if (!str.empty()) str += " | ";
str += "T_W1"; switch (type & T_B16) { // T_B16 = T_B32 | T_B64
} case T_B16: str += "|T_B16"; break;
if (type & T_EW0) { case T_B32: str += "|T_B32"; break;
if (!str.empty()) str += " | "; case T_B64: str += "|T_B64"; break;
str += "T_EW0"; default: break;
}
if (type & T_EW1) {
if (!str.empty()) str += " | ";
str += "T_EW1";
}
if (type & T_YMM) {
if (!str.empty()) str += " | ";
str += "T_YMM";
}
if (type & T_EVEX) {
if (!str.empty()) str += " | ";
str += "T_EVEX";
}
if (type & T_ER_X) {
if (!str.empty()) str += " | ";
str += "T_ER_X";
}
if (type & T_ER_Y) {
if (!str.empty()) str += " | ";
str += "T_ER_Y";
}
if (type & T_ER_Z) {
if (!str.empty()) str += " | ";
str += "T_ER_Z";
}
if (type & T_ER_R) {
if (!str.empty()) str += " | ";
str += "T_ER_R";
}
if (type & T_SAE_X) {
if (!str.empty()) str += " | ";
str += "T_SAE_X";
}
if (type & T_SAE_Y) {
if (!str.empty()) str += " | ";
str += "T_SAE_Y";
}
if (type & T_SAE_Z) {
if (!str.empty()) str += " | ";
str += "T_SAE_Z";
}
if (type & T_MUST_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MUST_EVEX";
}
if (type & T_B32) {
if (!str.empty()) str += " | ";
if (type & T_B64) {
str += "T_B16"; // T_B16 = T_B32 | T_B64
} else {
str += "T_B32";
}
} else if (type & T_B64) {
if (!str.empty()) str += " | ";
str += "T_B64";
}
if (type & T_M_K) {
if (!str.empty()) str += " | ";
str += "T_M_K";
}
if (type & T_VSIB) {
if (!str.empty()) str += " | ";
str += "T_VSIB";
}
if (type & T_MEM_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MEM_EVEX";
} }
if (type & T_M_K) str += "|T_M_K";
if (type & T_VSIB) str += "|T_VSIB";
if (type & T_MEM_EVEX) str += "|T_MEM_EVEX";
if (type & T_NF) str += "|T_NF";
if (type & T_CODE1_IF1) str += "|T_CODE1_IF1";
if (type & T_ND1) str += "|T_ND1";
if (type & T_ZU) str += "|T_ZU";
if (str[0] == '|') str = str.substr(1);
return str; return str;
} }

52
externals/xbyak/gen/avx_type_def.h vendored Normal file
View file

@ -0,0 +1,52 @@
// @@@begin of avx_type_def.h
static const uint64_t T_NONE = 0ull;
// low 3 bit
static const uint64_t T_N1 = 1ull;
static const uint64_t T_N2 = 2ull;
static const uint64_t T_N4 = 3ull;
static const uint64_t T_N8 = 4ull;
static const uint64_t T_N16 = 5ull;
static const uint64_t T_N32 = 6ull;
static const uint64_t T_NX_MASK = 7ull;
static const uint64_t T_DUP = T_NX_MASK;//1 << 4, // N = (8, 32, 64)
static const uint64_t T_N_VL = 1ull << 3; // N * (1, 2, 4) for VL
static const uint64_t T_APX = 1ull << 4;
static const uint64_t T_66 = 1ull << 5; // pp = 1
static const uint64_t T_F3 = 1ull << 6; // pp = 2
static const uint64_t T_ER_R = 1ull << 7; // reg{er}
static const uint64_t T_0F = 1ull << 8;
static const uint64_t T_0F38 = 1ull << 9;
static const uint64_t T_0F3A = 1ull << 10;
static const uint64_t T_L0 = 1ull << 11;
static const uint64_t T_L1 = 1ull << 12;
static const uint64_t T_W0 = 1ull << 13;
static const uint64_t T_W1 = 1ull << 14;
static const uint64_t T_EW0 = 1ull << 15;
static const uint64_t T_EW1 = 1ull << 16;
static const uint64_t T_YMM = 1ull << 17; // support YMM, ZMM
static const uint64_t T_EVEX = 1ull << 18;
static const uint64_t T_ER_X = 1ull << 19; // xmm{er}
static const uint64_t T_ER_Y = 1ull << 20; // ymm{er}
static const uint64_t T_ER_Z = 1ull << 21; // zmm{er}
static const uint64_t T_SAE_X = 1ull << 22; // xmm{sae}
static const uint64_t T_SAE_Y = 1ull << 23; // ymm{sae}
static const uint64_t T_SAE_Z = 1ull << 24; // zmm{sae}
static const uint64_t T_MUST_EVEX = 1ull << 25; // contains T_EVEX
static const uint64_t T_B32 = 1ull << 26; // m32bcst
static const uint64_t T_B64 = 1ull << 27; // m64bcst
static const uint64_t T_B16 = T_B32 | T_B64; // m16bcst (Be careful)
static const uint64_t T_M_K = 1ull << 28; // mem{k}
static const uint64_t T_VSIB = 1ull << 29;
static const uint64_t T_MEM_EVEX = 1ull << 30; // use evex if mem
static const uint64_t T_FP16 = 1ull << 31; // avx512-fp16
static const uint64_t T_MAP5 = T_FP16 | T_0F;
static const uint64_t T_MAP6 = T_FP16 | T_0F38;
static const uint64_t T_NF = 1ull << 32; // T_nf
static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8)
static const uint64_t T_ND1 = 1ull << 35; // ND=1
static const uint64_t T_ZU = 1ull << 36; // ND=ZU
static const uint64_t T_F2 = 1ull << 37; // pp = 3
// T_66 = 1, T_F3 = 2, T_F2 = 3
static inline uint32_t getPP(uint64_t type) { return (type & T_66) ? 1 : (type & T_F3) ? 2 : (type & T_F2) ? 3 : 0; }
// @@@end of avx_type_def.h

View file

@ -15,8 +15,7 @@ using namespace Xbyak;
void putOpmask(bool only64bit) void putOpmask(bool only64bit)
{ {
if (only64bit) { if (only64bit) {
puts("void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }"); puts("void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); }");
puts("void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }");
return; return;
} }
@ -76,22 +75,14 @@ void putOpmask(bool only64bit)
printf("void %sd(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x%02X, imm); }\n", p.name, p.code + 1); printf("void %sd(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x%02X, imm); }\n", p.name, p.code + 1);
} }
} }
puts("void kmovw(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W0, 0x90); }"); for (int i = 0; i < 4; i++) {
puts("void kmovq(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W1, 0x90); }"); const char tbl[] = "bwdq";
puts("void kmovb(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W0, 0x90); }"); const int bitTbl[] = { 8, 16, 32, 64 };
puts("void kmovd(const Opmask& k, const Operand& op) { if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W1, 0x90); }"); int bit = bitTbl[i];
printf("void kmov%c(const Opmask& k, const Operand& op) { opKmov(k, op, false, %d); }\n", tbl[i], bit);
puts("void kmovw(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W0, 0x91); }"); printf("void kmov%c(const Address& addr, const Opmask& k) { opKmov(k, addr, true, %d); }\n", tbl[i], bit);
puts("void kmovq(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W1, 0x91); }"); if (i < 3) printf("void kmov%c(const Reg32& r, const Opmask& k) { opKmov(k, r, true, %d); }\n", tbl[i], bit);
puts("void kmovb(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W0, 0x91); }"); }
puts("void kmovd(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W1, 0x91); }");
puts("void kmovw(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_W0, 0x92); }");
puts("void kmovw(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_W0, 0x93); }");
puts("void kmovb(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_66 | T_W0, 0x92); }");
puts("void kmovb(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_66 | T_W0, 0x93); }");
puts("void kmovd(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W0, 0x92); }");
puts("void kmovd(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W0, 0x93); }");
} }
// vcmppd(k, x, op) // vcmppd(k, x, op)
@ -100,7 +91,7 @@ void putVcmp()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
bool hasIMM; bool hasIMM;
} tbl[] = { } tbl[] = {
{ 0xC2, "vcmppd", T_0F | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_YMM | T_66 | T_B64, true }, { 0xC2, "vcmppd", T_0F | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_YMM | T_66 | T_B64, true },
@ -142,9 +133,9 @@ void putVcmp()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n" printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : ""); , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
} }
puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }"); puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }");
puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }"); puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }");
@ -173,7 +164,7 @@ void putX_XM()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
} tbl[] = { } tbl[] = {
{ 0x6F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z }, { 0x6F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z },
{ 0x6F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z }, { 0x6F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z },
@ -210,8 +201,8 @@ void putX_XM()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); printf("void %s(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
} }
puts("void vpabsq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F); }"); puts("void vpabsq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F); }");
@ -229,7 +220,7 @@ void putM_X()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
} tbl[] = { } tbl[] = {
{ 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x7F, "vmovdqa32", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
{ 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x7F, "vmovdqa64", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
@ -242,8 +233,8 @@ void putM_X()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); printf("void %s(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
} }
} }
@ -252,7 +243,7 @@ void putXM_X()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
} tbl[] = { } tbl[] = {
{ 0x8A, "vcompresspd", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_N8 }, { 0x8A, "vcompresspd", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_N8 },
{ 0x8A, "vcompressps", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 }, { 0x8A, "vcompressps", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N4 },
@ -265,8 +256,8 @@ void putXM_X()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, type.c_str(), p->code); printf("void %s(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
} }
} }
@ -275,7 +266,7 @@ void putX_X_XM_IMM()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
bool hasIMM; bool hasIMM;
} tbl[] = { } tbl[] = {
{ 0x03, "valignd", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM, true }, { 0x03, "valignd", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM, true },
@ -413,9 +404,9 @@ void putX_X_XM_IMM()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n" printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : ""); , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
} }
} }
@ -425,7 +416,7 @@ void putShift()
const char *name; const char *name;
uint8_t code; uint8_t code;
int idx; int idx;
int type; uint64_t type;
} tbl[] = { } tbl[] = {
{ "vpsraq", 0x72, 4, T_0F | T_66 | T_YMM | T_MUST_EVEX |T_EW1 | T_B64 }, { "vpsraq", 0x72, 4, T_0F | T_66 | T_YMM | T_MUST_EVEX |T_EW1 | T_B64 },
{ "vprold", 0x72, 1, T_66 | T_0F | T_YMM | T_MUST_EVEX | T_EW0 | T_B32 }, { "vprold", 0x72, 1, T_66 | T_0F | T_YMM | T_MUST_EVEX | T_EW0 | T_B32 },
@ -435,8 +426,8 @@ void putShift()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
printf("void %s(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_X_XM(Xmm(x.getKind(), %d), x, op, %s, 0x%02X, imm); }\n", p.name, p.idx, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_X_XM(Xmm(x.getKind(), %d), x, op, %s, 0x%02X, imm); }\n", p.name, p.idx, s.c_str(), p.code);
} }
} }
@ -446,7 +437,7 @@ void putExtractInsert()
const struct Tbl { const struct Tbl {
const char *name; const char *name;
uint8_t code; uint8_t code;
int type; uint64_t type;
bool isZMM; bool isZMM;
} tbl[] = { } tbl[] = {
{ "vextractf32x4", 0x19, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false }, { "vextractf32x4", 0x19, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false },
@ -461,16 +452,16 @@ void putExtractInsert()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
const char *kind = p.isZMM ? "Operand::MEM | Operand::YMM" : "Operand::MEM | Operand::XMM"; const char *kind = p.isZMM ? "Operand::MEM | Operand::YMM" : "Operand::MEM | Operand::XMM";
printf("void %s(const Operand& op, const %s& r, uint8_t imm) { if (!op.is(%s)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, %s, 0x%2X, imm); }\n", p.name, p.isZMM ? "Zmm" : "Ymm", kind, type.c_str(), p.code); printf("void %s(const Operand& op, const %s& r, uint8_t imm) { if (!op.is(%s)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, %s, 0x%2X, imm); }\n", p.name, p.isZMM ? "Zmm" : "Ymm", kind, s.c_str(), p.code);
} }
} }
{ {
const struct Tbl { const struct Tbl {
const char *name; const char *name;
uint8_t code; uint8_t code;
int type; uint64_t type;
bool isZMM; bool isZMM;
} tbl[] = { } tbl[] = {
{ "vinsertf32x4", 0x18, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false }, { "vinsertf32x4", 0x18, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_YMM | T_N16, false },
@ -485,12 +476,12 @@ void putExtractInsert()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
const char *x = p.isZMM ? "Zmm" : "Ymm"; const char *x = p.isZMM ? "Zmm" : "Ymm";
const char *cond = p.isZMM ? "op.is(Operand::MEM | Operand::YMM)" : "(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))"; const char *cond = p.isZMM ? "op.is(Operand::MEM | Operand::YMM)" : "(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))";
printf("void %s(const %s& r1, const %s& r2, const Operand& op, uint8_t imm) {" printf("void %s(const %s& r1, const %s& r2, const Operand& op, uint8_t imm) {"
"if (!%s) XBYAK_THROW(ERR_BAD_COMBINATION) " "if (!%s) XBYAK_THROW(ERR_BAD_COMBINATION) "
"opVex(r1, &r2, op, %s, 0x%2X, imm); }\n", p.name, x, x, cond, type.c_str(), p.code); "opVex(r1, &r2, op, %s, 0x%2X, imm); }\n", p.name, x, x, cond, s.c_str(), p.code);
} }
} }
} }
@ -501,7 +492,7 @@ void putBroadcast(bool only64bit)
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
int reg; int reg;
} tbl[] = { } tbl[] = {
{ 0x7A, "vpbroadcastb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, 8 }, { 0x7A, "vpbroadcastb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, 8 },
@ -511,9 +502,9 @@ void putBroadcast(bool only64bit)
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
if ((only64bit && p.reg == 64) || (!only64bit && p.reg != 64)) { if ((only64bit && p.reg == 64) || (!only64bit && p.reg != 64)) {
printf("void %s(const Xmm& x, const Reg%d& r) { opVex(x, 0, r, %s, 0x%02X); }\n", p.name, p.reg, type.c_str(), p.code); printf("void %s(const Xmm& x, const Reg%d& r) { opVex(x, 0, r, %s, 0x%02X); }\n", p.name, p.reg, s.c_str(), p.code);
} }
} }
} }
@ -536,7 +527,7 @@ void putCvt()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
int ptn; int ptn;
} tbl[] = { } tbl[] = {
{ 0x79, "vcvtsd2usi", T_F2 | T_0F | T_MUST_EVEX | T_N8 | T_ER_X, 0 }, { 0x79, "vcvtsd2usi", T_F2 | T_0F | T_MUST_EVEX | T_N8 | T_ER_X, 0 },
@ -583,28 +574,28 @@ void putCvt()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
switch (p.ptn) { switch (p.ptn) {
case 0: case 0:
printf("void %s(const Reg32e& r, const Operand& op) { int type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 1: case 1:
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 2: case 2:
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 3: case 3:
printf("void %s(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 4: case 4:
printf("void %s(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 5: case 5:
printf("void %s(const Xmm& x, const Operand& op) { opCvt5(x, op, %s, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x, const Operand& op) { opCvt5(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
case 6: case 6:
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) int type = (%s) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x%02X); }\n", p.name, type.c_str(), p.code); printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (%s) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
break; break;
} }
} }
@ -621,7 +612,7 @@ void putGather()
{ {
const struct Tbl { const struct Tbl {
const char *name; const char *name;
int type; uint64_t type;
uint8_t code; uint8_t code;
int mode; int mode;
} tbl[] = { } tbl[] = {
@ -636,15 +627,15 @@ void putGather()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_VSIB); std::string s = type2String(p.type | T_VSIB);
printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode); printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, s.c_str(), p.code, p.mode);
} }
} }
void putScatter() void putScatter()
{ {
const struct Tbl { const struct Tbl {
const char *name; const char *name;
int type; uint64_t type;
uint8_t code; uint8_t code;
int mode; // reverse of gather int mode; // reverse of gather
} tbl[] = { } tbl[] = {
@ -660,8 +651,8 @@ void putScatter()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_VSIB); std::string s = type2String(p.type | T_VSIB);
printf("void %s(const Address& addr, const Xmm& x) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode); printf("void %s(const Address& addr, const Xmm& x) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, s.c_str(), p.code, p.mode);
} }
} }
@ -689,7 +680,7 @@ void putMov()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
int mode; int mode;
} tbl[] = { } tbl[] = {
{ 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false }, { 0x32, "vpmovqb", T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_N2 | T_N_VL | T_M_K, false },
@ -718,8 +709,8 @@ void putMov()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type); std::string s = type2String(p.type);
printf("void %s(const Operand& op, const Xmm& x) { opVmov(op, x, %s, 0x%02X, %s); }\n", p.name, type.c_str(), p.code, p.mode ? "true" : "false"); printf("void %s(const Operand& op, const Xmm& x) { opVmov(op, x, %s, 0x%02X, %s); }\n", p.name, s.c_str(), p.code, p.mode ? "true" : "false");
} }
} }
} }
@ -729,7 +720,7 @@ void putX_XM_IMM()
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
const char *name; const char *name;
int type; uint64_t type;
bool hasIMM; bool hasIMM;
} tbl[] = { } tbl[] = {
{ 0x26, "vgetmantpd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_SAE_Z, true }, { 0x26, "vgetmantpd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_SAE_Z, true },
@ -770,9 +761,9 @@ void putX_XM_IMM()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
std::string type = type2String(p->type); std::string s = type2String(p->type);
printf("void %s(const Xmm& x, const Operand& op%s) { opAVX_X_XM_IMM(x, op, %s, 0x%02X%s); }\n" printf("void %s(const Xmm& x, const Operand& op%s) { opAVX_X_XM_IMM(x, op, %s, 0x%02X%s); }\n"
, p->name, p->hasIMM ? ", uint8_t imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : ""); , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : "");
} }
} }
@ -784,7 +775,7 @@ void putMisc()
const struct Tbl { const struct Tbl {
const char *name; const char *name;
int zm; int zm;
int type; uint64_t type;
uint8_t code; uint8_t code;
bool isZmm; bool isZmm;
} tbl[] = { } tbl[] = {
@ -810,9 +801,9 @@ void putMisc()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
std::string type = type2String(p.type | T_66 | T_0F38 | T_MUST_EVEX | T_M_K | T_VSIB); std::string s = type2String(p.type | T_66 | T_0F38 | T_MUST_EVEX | T_M_K | T_VSIB);
printf("void %s(const Address& addr) { opGatherFetch(addr, zm%d, %s, 0x%2X, Operand::%s); }\n" printf("void %s(const Address& addr) { opGatherFetch(addr, zm%d, %s, 0x%2X, Operand::%s); }\n"
, p.name, p.zm, type.c_str(), p.code, p.isZmm ? "ZMM" : "YMM"); , p.name, p.zm, s.c_str(), p.code, p.isZmm ? "ZMM" : "YMM");
} }
} }
@ -887,18 +878,18 @@ void putFP16_FMA()
{ "213", 0xA0 }, { "213", 0xA0 },
{ "231", 0xB0 }, { "231", 0xB0 },
}; };
int t = T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX; uint64_t type = T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX;
const char *suf = 0; const char *suf = 0;
if (tbl[i].isPH) { if (tbl[i].isPH) {
t |= T_ER_Z | T_YMM | T_B16; type |= T_ER_Z | T_YMM | T_B16;
suf = "ph"; suf = "ph";
} else { } else {
t |= T_ER_X | T_N2; type |= T_ER_X | T_N2;
suf = "sh"; suf = "sh";
} }
std::string type = type2String(t); std::string s = type2String(type);
printf("void %s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n" printf("void %s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n"
, tbl[i].name, ord[k].str, suf, type.c_str(), tbl[i].code | ord[k].code); , tbl[i].name, ord[k].str, suf, s.c_str(), tbl[i].code | ord[k].code);
} }
} }
} }
@ -914,23 +905,23 @@ void putFP16_FMA2()
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
for (int j = 0; j < 2; j++) { for (int j = 0; j < 2; j++) {
int t = T_MAP6 | T_EW0 | T_MUST_EVEX; uint64_t type = T_MAP6 | T_EW0 | T_MUST_EVEX;
if (j == 0) { if (j == 0) {
t |= T_F2; type |= T_F2;
} else { } else {
t |= T_F3; type |= T_F3;
} }
const char *suf = 0; const char *suf = 0;
if (tbl[i].isPH) { if (tbl[i].isPH) {
t |= T_ER_Z | T_YMM | T_B32; type |= T_ER_Z | T_YMM | T_B32;
suf = "ph"; suf = "ph";
} else { } else {
t |= T_ER_X | T_N2; type |= T_ER_X | T_N2;
suf = "sh"; suf = "sh";
} }
std::string type = type2String(t); std::string s = type2String(type);
printf("void vf%s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n" printf("void vf%s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n"
, j == 0 ? "c" : "", tbl[i].name, suf, type.c_str(), tbl[i].code); , j == 0 ? "c" : "", tbl[i].name, suf, s.c_str(), tbl[i].code);
} }
} }
} }
@ -938,16 +929,16 @@ void putFP16_FMA2()
void putFP16_2() void putFP16_2()
{ {
{ {
int t = T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2; uint64_t type = T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2;
std::string type = type2String(t); std::string s = type2String(type);
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", type.c_str()); printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", type.c_str()); printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
} }
{ {
int t = T_66 | T_MAP5 | T_MUST_EVEX | T_N2; uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
std::string type = type2String(t); std::string s = type2String(type);
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", type.c_str()); printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str());
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", type.c_str()); printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str());
} }
} }

File diff suppressed because it is too large Load diff

View file

@ -5,7 +5,7 @@
project( project(
'xbyak', 'xbyak',
'cpp', 'cpp',
version: '6.68', version: '7.05',
license: 'BSD-3-Clause', license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release' default_options: 'b_ndebug=if-release'
) )

View file

@ -1,5 +1,5 @@
# Xbyak 6.68 [![Badge Build]][Build Status] # Xbyak 7.05 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@ -21,13 +21,21 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl
- header file only - header file only
- Intel/MASM like syntax - Intel/MASM like syntax
- fully support AVX-512 - fully support AVX-512
- support APX/AVX10
**Note**: **Note**:
Use `and_()`, `or_()`, ... instead of `and()`, `or()`. Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
If you want to use them, then specify `-fno-operator-names` option to gcc/clang. If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### Derived Projects
- [Xbyak_aarch64](https://github.com/fujitsu/xbyak_aarch64/) : for AArch64
- [Xbyak_riscv](https://github.com/herumi/xbyak_riscv) : for RISC-V
### News ### News
- support RAO-INT for APX
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
- support APX except for a few instructions
- add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma - add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
- add movdiri, movdir64b, clwb, cldemote - add movdiri, movdir64b, clwb, cldemote
- WAITPKG instructions (tpause, umonitor, umwait) are supported. - WAITPKG instructions (tpause, umonitor, umwait) are supported.

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68 C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.05
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎概要 ◎概要
@ -46,6 +46,8 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎新機能 ◎新機能
APX/AVX10対応
例外なしモード追加 例外なしモード追加
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。 XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
エラーは例外の代わりに`Xbyak::GetError()`で通達されます。 エラーは例外の代わりに`Xbyak::GetError()`で通達されます。
@ -402,6 +404,19 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎履歴 ◎履歴
2024/01/03 ver 7.05 APX対応RAO-INT
2023/12/28 ver 7.04 2バイトオペコードのrex2対応
2023/12/26 ver 7.03 dfvのデフォルト値を0に設定
2023/12/20 ver 7.02 SHA*のAPX対応
2023/12/19 ver 7.01 AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE対応 APX10/APX判定対応
2023/12/01 ver 7.00 APX対応
2023/08/07 ver 6.73 sha512/sm3/sm4/avx-vnni-int16追加
2023/08/02 ver 6.72 xabort, xbegin, xend追加
2023/07/27 ver 6.71 Allocatorでhuge pageを考慮する。
2023/07/05 ver 6.70 vpclmulqdqのailas追加
2023/06/27 ver 6.69.2 `TypeT operator|`にconstexpr追加(thanks to Wunkolo)
2023/03/23 ver 6.69.1 xsave判定追加(thanks to Wunkolo)
2023/02/20 ver 6.69 util::CpuがAMD対応 UINTR命令対応
2022/12/07 ver 6.68 prefetchit{0,1}サポート 2022/12/07 ver 6.68 prefetchit{0,1}サポート
2022/11/30 ver 6.67 CMPccXADDサポート 2022/11/30 ver 6.67 CMPccXADDサポート
2022/11/25 ver 6.66 RAO-INTサポート 2022/11/25 ver 6.66 RAO-INTサポート

View file

@ -30,7 +30,7 @@ else
endif endif
ifeq ($(BIT),64) ifeq ($(BIT),64)
TARGET += test64 bf64 memfunc64 test_util64 jmp_table64 TARGET += test64 bf64 memfunc64 test_util64 jmp_table64 zero_upper ccmp no_flags
ifeq ($(BOOST_EXIST),1) ifeq ($(BOOST_EXIST),1)
TARGET += calc64 #calc2_64 TARGET += calc64 #calc2_64
endif endif
@ -103,6 +103,18 @@ profiler: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ $(CXX) $(CFLAGS) profiler.cpp -o $@
profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h profiler-vtune: profiler.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl $(CXX) $(CFLAGS) profiler.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
zero_upper: zero_upper.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) zero_upper.cpp -o $@
test_zero_upper: zero_upper
sde -future -- ./zero_upper
ccmp: ccmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) ccmp.cpp -o $@
test_ccmp: ccmp
sde -future -- ./ccmp
no_flags: no_flags.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) no_flags.cpp -o $@
test_no_flags: no_flags
sde -future -- ./no_flags
clean: clean:
rm -rf $(TARGET) profiler profiler-vtune rm -rf $(TARGET) profiler profiler-vtune
@ -122,7 +134,7 @@ toyvm : toyvm.cpp $(XBYAK_INC)
static_buf: static_buf.cpp $(XBYAK_INC) static_buf: static_buf.cpp $(XBYAK_INC)
static_buf64: static_buf.cpp $(XBYAK_INC) static_buf64: static_buf.cpp $(XBYAK_INC)
test_util : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h test_util : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h
test_util2 : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h test_util64 : test_util.cpp $(XBYAK_INC) ../xbyak/xbyak_util.h
jmp_table: jmp_table.cpp $(XBYAK_INC) jmp_table: jmp_table.cpp $(XBYAK_INC)
jmp_table64: jmp_table.cpp $(XBYAK_INC) jmp_table64: jmp_table.cpp $(XBYAK_INC)
memfd: memfd.cpp $(XBYAK_INC) memfd: memfd.cpp $(XBYAK_INC)

68
externals/xbyak/sample/ccmp.cpp vendored Normal file
View file

@ -0,0 +1,68 @@
/*
An example of ccmp
> g++ ccmp.cpp -I ../xbyak
> sde -future -- ./a.out
*/
#include <stdio.h>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
using namespace Xbyak;
struct Code1 : Xbyak::CodeGenerator {
Code1()
{
Xbyak::util::StackFrame sf(this, 2);
const auto& p1 = sf.p[0];
const auto& p2 = sf.p[1];
int dfv = 0;
cmp(p1, 3);
ctesta(p2, 1, dfv); // eflags = (p1 > 3) ? ((p2 & 1) == 0) : dfv;
setz(al|T_zu);
}
};
struct Code2 : Xbyak::CodeGenerator {
Code2()
{
Xbyak::util::StackFrame sf(this, 3);
const auto& p1 = sf.p[0];
const auto& p2 = sf.p[1];
const auto& p3 = sf.p[2];
int dfv = 0;
cmp(p1, 1);
ccmpe(p2, 2, dfv); // eflags = p1==1 ? p2==2 : dfv;
ccmpe(p3, 3, dfv); // eflags = (p1==1 && p2==2) ? p3==3 : dfv;
setz(al|T_zu); // p1==1 && p2==2 && p3==3
}
};
int main()
try
{
{
puts("(p1 > 3) && ((p2 & 1) == 0)");
Code1 c;
auto f = c.getCode<int (*)(int, int)>();
for (int p1 = 2; p1 < 5; p1++) {
for (int p2 = 0; p2 < 3; p2++) {
printf("p1=%d p2=%d ret=%d (%d)\n", p1, p2, f(p1, p2), p1 > 3 && ((p2&1) == 0));
}
}
}
{
puts("p1 == 1 && p2 == 2 && p3 == 3");
Code2 c;
auto f = c.getCode<int (*)(int, int, int)>();
for (int p1 = 0; p1 < 3; p1++) {
for (int p2 = 1; p2 < 4; p2++) {
for (int p3 = 2; p3 < 5; p3++) {
printf("p1=%d p2=%d p3=%d ret=%d (%d)\n", p1, p2, p3, f(p1, p2, p3), p1==1 && p2==2 && p3==3);
}
}
}
}
} catch (std::exception& e) {
printf("ERR %s\n", e.what());
}

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx_vnni waitpkg clflushopt cldemote movdiri movdir64b mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

2
externals/xbyak/sample/cpuid/arl.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi clflushopt

View file

@ -1,11 +1,25 @@
#!/bin/bash #!/bin/bash
UPDATE=0
if [ $# -eq 1 ]; then
UPDATE=1
fi
if [ $UPDATE == 1 ]; then
echo "UPDATE"
fi
make -C ../ test_util64 make -C ../ test_util64
cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl spr) cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl mtl rpl spr gnr srf grr arl lnl)
for cpu in ${cpus[@]} ; do for cpu in ${cpus[@]} ; do
echo $cpu echo $cpu
~/bin/sde -$cpu -- ../test_util64 -cpuid > tmp.txt if [ $UPDATE == 1 ]; then
diff tmp.txt $cpu.txt ~/bin/sde -$cpu -- ../test_util64 -cpuid > $cpu.txt
else
~/bin/sde -$cpu -- ../test_util64 -cpuid > tmp.txt
diff $cpu.txt tmp.txt
fi
done done

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt

2
externals/xbyak/sample/cpuid/gnr.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize amx_fp16 prefetchiti avx10

2
externals/xbyak/sample/cpuid/grr.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma rao-int cmpccxadd aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand f16c movbe mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand f16c movbe

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx enh_rep rdrand f16c mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx enh_rep rdrand f16c

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd avx512_4vnniw avx512_4fmaps avx512_vpopcntdq mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed prefetchwt1 f16c movbe avx512f avx512pf avx512er avx512cd avx512_4vnniw avx512_4fmaps avx512_vpopcntdq

2
externals/xbyak/sample/cpuid/lnl.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide

2
externals/xbyak/sample/cpuid/mtl.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

2
externals/xbyak/sample/cpuid/rpl.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe clflushopt

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt clwb

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 avx512_vp2intersect amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize

2
externals/xbyak/sample/cpuid/srf.txt vendored Normal file
View file

@ -0,0 +1,2 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt movdiri movdir64b mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt clwb movdiri movdir64b aeskle wide_kl keylocker keylocker_wide

View file

@ -1,2 +0,0 @@
vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 avx512_vp2intersect amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b

View file

@ -1,2 +1,2 @@
vendor intel vendor intel
mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq enh_rep rdrand rdseed smap sha movbe clflushopt mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe gfni clflushopt cldemote clwb

View file

@ -1,10 +0,0 @@
#!/bin/bash
make -C ../ test_util64
cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx knl knm tgl adl spr)
for cpu in ${cpus[@]} ; do
echo $cpu
~/bin/sde -$cpu -- ../test_util64 -cpuid > $cpu.txt
done

25
externals/xbyak/sample/no_flags.cpp vendored Normal file
View file

@ -0,0 +1,25 @@
#include <stdio.h>
#include <xbyak/xbyak.h>
struct Code : Xbyak::CodeGenerator {
Code(bool nf) {
xor_(eax, eax); // CF = 0
mov(eax, -1);
if (nf) {
puts("no flags (with T_nf)");
add(eax|T_nf, eax, 1); // does not change CF
} else {
puts("change flags (without T_nf)");
add(eax, eax, 1); // CF = 1
}
adc(eax, 0); // eax = CF ? 1 : 0
ret();
}
};
int main() {
for (int i = 0; i < 2; i++) {
Code c(i);
printf("i=%d ret=%d\n", i, c.getCode<int(*)()>()());
}
}

View file

@ -31,12 +31,14 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tSSSE3, "ssse3" }, { Cpu::tSSSE3, "ssse3" },
{ Cpu::tSSE41, "sse41" }, { Cpu::tSSE41, "sse41" },
{ Cpu::tSSE42, "sse42" }, { Cpu::tSSE42, "sse42" },
{ Cpu::tSSE4a, "sse4a" },
{ Cpu::tPOPCNT, "popcnt" }, { Cpu::tPOPCNT, "popcnt" },
{ Cpu::t3DN, "3dn" }, { Cpu::t3DN, "3dn" },
{ Cpu::tE3DN, "e3dn" }, { Cpu::tE3DN, "e3dn" },
{ Cpu::tAESNI, "aesni" }, { Cpu::tAESNI, "aesni" },
{ Cpu::tRDTSCP, "rdtscp" }, { Cpu::tRDTSCP, "rdtscp" },
{ Cpu::tOSXSAVE, "osxsave(xgetvb)" }, { Cpu::tXSAVE, "xsave(xgetvb)" },
{ Cpu::tOSXSAVE, "osxsave" },
{ Cpu::tPCLMULQDQ, "pclmulqdq" }, { Cpu::tPCLMULQDQ, "pclmulqdq" },
{ Cpu::tAVX, "avx" }, { Cpu::tAVX, "avx" },
{ Cpu::tFMA, "fma" }, { Cpu::tFMA, "fma" },
@ -86,8 +88,11 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tWAITPKG, "waitpkg" }, { Cpu::tWAITPKG, "waitpkg" },
{ Cpu::tCLFLUSHOPT, "clflushopt" }, { Cpu::tCLFLUSHOPT, "clflushopt" },
{ Cpu::tCLDEMOTE, "cldemote" }, { Cpu::tCLDEMOTE, "cldemote" },
{ Cpu::tCLWB, "clwb" },
{ Cpu::tMOVDIRI, "movdiri" }, { Cpu::tMOVDIRI, "movdiri" },
{ Cpu::tMOVDIR64B, "movdir64b" }, { Cpu::tMOVDIR64B, "movdir64b" },
{ Cpu::tUINTR, "uintr" },
{ Cpu::tSERIALIZE, "serialize" },
{ Cpu::tCLZERO, "clzero" }, { Cpu::tCLZERO, "clzero" },
{ Cpu::tAMX_FP16, "amx_fp16" }, { Cpu::tAMX_FP16, "amx_fp16" },
{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" }, { Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
@ -96,12 +101,25 @@ void putCPUinfo(bool onlyCpuidFeature)
{ Cpu::tRAO_INT, "rao-int" }, { Cpu::tRAO_INT, "rao-int" },
{ Cpu::tCMPCCXADD, "cmpccxadd" }, { Cpu::tCMPCCXADD, "cmpccxadd" },
{ Cpu::tPREFETCHITI, "prefetchiti" }, { Cpu::tPREFETCHITI, "prefetchiti" },
{ Cpu::tSHA512, "sha512" },
{ Cpu::tSM3, "sm3" },
{ Cpu::tSM4, "sm4" },
{ Cpu::tAVX_VNNI_INT16, "avx_vnni_int16" },
{ Cpu::tAPX_F, "apx_f" },
{ Cpu::tAVX10, "avx10" },
{ Cpu::tAESKLE, "aeskle" },
{ Cpu::tWIDE_KL, "wide_kl" },
{ Cpu::tKEYLOCKER, "keylocker" },
{ Cpu::tKEYLOCKER_WIDE, "keylocker_wide" },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
} }
printf("\n"); printf("\n");
if (onlyCpuidFeature) return; if (onlyCpuidFeature) return;
if (cpu.has(Cpu::tAVX10)) {
printf("AVX10 version %d\n", cpu.getAVX10version());
}
if (cpu.has(Cpu::tPOPCNT)) { if (cpu.has(Cpu::tPOPCNT)) {
const int n = 0x12345678; // bitcount = 13 const int n = 0x12345678; // bitcount = 13
const int ok = 13; const int ok = 13;
@ -127,7 +145,6 @@ void putCPUinfo(bool onlyCpuidFeature)
Core i7-3930K 6 2D Core i7-3930K 6 2D
*/ */
cpu.putFamily(); cpu.putFamily();
if (!cpu.has(Cpu::tINTEL)) return;
for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
} }

48
externals/xbyak/sample/zero_upper.cpp vendored Normal file
View file

@ -0,0 +1,48 @@
/*
An example of T_zu (zero upper) flag
> g++ zero_upper.cpp -I ../xbyak
> sde -future -- ./a.out
*/
#include <stdio.h>
#include <xbyak/xbyak.h>
using namespace Xbyak;
struct Code : Xbyak::CodeGenerator {
Code(int mode)
{
mov(eax, 0x12345678);
cmp(eax, eax); // ZF=1
switch (mode) {
case 0: // imul
puts("imul");
imul(ax,ax, 0x1234);
break;
case 1: // imul+zu
puts("imul+zu");
imul(ax|T_zu, ax, 0x1234);
break;
case 2: // setz
puts("setz");
setz(al);
break;
case 3: // setz+zu
puts("setz+zu");
setz(al|T_zu);
break;
}
ret();
}
};
int main()
try
{
for (int mode = 0; mode < 4; mode++) {
Code c(mode);
auto f = c.getCode<int (*)()>();
printf("ret=%08x\n", f());
}
} catch (std::exception& e) {
printf("ERR %s\n", e.what());
}

View file

@ -1,5 +1,5 @@
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
XBYAK_INC=../xbyak/xbyak.h XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h
UNAME_S=$(shell uname -s) UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32) ifeq ($(shell ./detect_x32),x32)
X32?=1 X32?=1
@ -13,45 +13,50 @@ ifeq ($(UNAME_S),Darwin)
# 32-bit binary is not supported # 32-bit binary is not supported
ONLY_64BIT=1 ONLY_64BIT=1
endif endif
ifeq ($(findstring MINGW64,$(UNAME_S)),MINGW64)
ONLY_64BIT=1
endif
ifeq ($(ONLY_64BIT),0) ifeq ($(ONLY_64BIT),0)
TARGET += jmp address TARGET += jmp address
endif endif
ifeq ($(BIT),64) ifeq ($(BIT),64)
TARGET += jmp64 address64 TARGET += jmp64 address64 apx
endif endif
all: $(TARGET) all: $(TARGET)
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS=-O2 -Wall -I../ -I./ $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x CFLAGS=-O2 -Wall -I.. -I. $(CFLAGS_WARN) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) #-std=c++0x
make_nm: make_nm:
$(CXX) $(CFLAGS) make_nm.cpp -o $@ $(CXX) $(CFLAGS) make_nm.cpp -o $@
normalize_prefix: normalize_prefix.cpp ../xbyak/xbyak.h normalize_prefix: normalize_prefix.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) normalize_prefix.cpp -o $@ $(CXX) $(CFLAGS) normalize_prefix.cpp -o $@
test_mmx: test_mmx.cpp ../xbyak/xbyak.h test_mmx: test_mmx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) test_mmx.cpp -o $@ -lpthread $(CXX) $(CFLAGS) test_mmx.cpp -o $@ -lpthread
jmp: jmp.cpp ../xbyak/xbyak.h jmp: jmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) jmp.cpp -o $@ -m32 $(CXX) $(CFLAGS) jmp.cpp -o $@ -m32
jmp64: jmp.cpp ../xbyak/xbyak.h jmp64: jmp.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) jmp.cpp -o $@ -m64 $(CXX) $(CFLAGS) jmp.cpp -o $@ -m64
address: address.cpp ../xbyak/xbyak.h address: address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) address.cpp -o $@ -m32 $(CXX) $(CFLAGS) address.cpp -o $@ -m32
address64: address.cpp ../xbyak/xbyak.h address64: address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) address.cpp -o $@ -m64 $(CXX) $(CFLAGS) address.cpp -o $@ -m64
bad_address: bad_address.cpp ../xbyak/xbyak.h bad_address: bad_address.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) bad_address.cpp -o $@ $(CXX) $(CFLAGS) bad_address.cpp -o $@
misc: misc.cpp ../xbyak/xbyak.h misc: misc.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) misc.cpp -o $@ $(CXX) $(CFLAGS) misc.cpp -o $@
misc32: misc.cpp ../xbyak/xbyak.h misc32: misc.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) misc.cpp -o $@ -DXBYAK32 $(CXX) $(CFLAGS) misc.cpp -o $@ -DXBYAK32
cvt_test: cvt_test.cpp ../xbyak/xbyak.h cvt_test: cvt_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ $(CXX) $(CFLAGS) $< -o $@
cvt_test32: cvt_test.cpp ../xbyak/xbyak.h cvt_test32: cvt_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -DXBYAK32 $(CXX) $(CFLAGS) $< -o $@ -DXBYAK32
noexception: noexception.cpp ../xbyak/xbyak.h noexception: noexception.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -fno-exceptions $(CXX) $(CFLAGS) $< -o $@ -fno-exceptions
apx: apx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) apx.cpp -o $@
test_nm: normalize_prefix $(TARGET) test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen $(MAKE) -C ../gen
@ -75,6 +80,7 @@ ifneq ($(X32),1)
CXX=$(CXX) ./test_nm.sh Y64 CXX=$(CXX) ./test_nm.sh Y64
endif endif
./jmp64 ./jmp64
./apx
endif endif
test_avx: normalize_prefix test_avx: normalize_prefix
@ -112,3 +118,4 @@ lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
make_nm: make_nm.cpp $(XBYAK_INC) make_nm: make_nm.cpp $(XBYAK_INC)
.PHONY: test

1964
externals/xbyak/test/apx.cpp vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -109,8 +109,8 @@ CYBOZU_TEST_AUTO(changeBit)
{ &dil, &di, &edi, &rdi, &xmm7, &ymm7, &zmm7 }, { &dil, &di, &edi, &rdi, &xmm7, &ymm7, &zmm7 },
{ &r8b, &r8w, &r8d, &r8, &xmm8, &ymm8, &zmm8 }, { &r8b, &r8w, &r8d, &r8, &xmm8, &ymm8, &zmm8 },
{ &r15b, &r15w, &r15d, &r15, &xmm15, &ymm15, &zmm15 }, { &r15b, &r15w, &r15d, &r15, &xmm15, &ymm15, &zmm15 },
{ 0, 0, 0, 0, &xmm16, &ymm16, &zmm16 }, { &r16b, &r16w, &r16d, &r16, &xmm16, &ymm16, &zmm16 },
{ 0, 0, 0, 0, &xmm31, &ymm31, &zmm31 }, { &r31b, &r31w, &r31d, &r31, &xmm31, &ymm31, &zmm31 },
}; };
const int bitTbl[N] = { 8, 16, 32, 64, 128, 256, 512 }; const int bitTbl[N] = { 8, 16, 32, 64, 128, 256, 512 };
#else #else

View file

@ -558,6 +558,7 @@ class Test {
"wbinvd", "wbinvd",
"wrmsr", "wrmsr",
"xlatb", "xlatb",
"xend",
"popf", "popf",
"pushf", "pushf",
@ -1050,6 +1051,10 @@ class Test {
"nle", "nle",
"g", "g",
}; };
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-truncation" // wrong detection
#endif
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
char buf[32]; char buf[32];
snprintf(buf, sizeof(buf), "cmov%s", tbl[i]); snprintf(buf, sizeof(buf), "cmov%s", tbl[i]);
@ -1059,6 +1064,9 @@ class Test {
snprintf(buf, sizeof(buf), "set%s", tbl[i]); snprintf(buf, sizeof(buf), "set%s", tbl[i]);
put(buf, REG8|REG8_3|MEM); put(buf, REG8|REG8_3|MEM);
} }
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
} }
void putReg1() const void putReg1() const
{ {
@ -1326,6 +1334,7 @@ class Test {
#ifdef XBYAK64 #ifdef XBYAK64
put("cmpxchg16b", MEM); put("cmpxchg16b", MEM);
put("fxrstor64", MEM); put("fxrstor64", MEM);
put("xbegin", "0x12345678");
#endif #endif
{ {
const char tbl[][8] = { const char tbl[][8] = {
@ -1348,6 +1357,7 @@ class Test {
put("xchg", EAX|REG32, EAX|REG32|MEM); put("xchg", EAX|REG32, EAX|REG32|MEM);
put("xchg", MEM, EAX|REG32); put("xchg", MEM, EAX|REG32);
put("xchg", REG64, REG64|MEM); put("xchg", REG64, REG64|MEM);
put("xabort", IMM8);
} }
void putShift() const void putShift() const
{ {
@ -1493,18 +1503,6 @@ class Test {
put(p, XMM, XMM|MEM, IMM); put(p, XMM, XMM|MEM, IMM);
} }
} }
{
const char tbl[][16] = {
"pclmullqlqdq",
"pclmulhqlqdq",
// "pclmullqhdq", // QQQ : not supported by nasm/yasm
// "pclmulhqhdq",
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *p = tbl[i];
put(p, XMM, XMM|MEM);
}
}
put("extractps", REG32e|MEM, XMM, IMM); put("extractps", REG32e|MEM, XMM, IMM);
put("pextrw", REG32e|MEM, XMM, IMM); // pextrw for REG32 is for MMX2 put("pextrw", REG32e|MEM, XMM, IMM); // pextrw for REG32 is for MMX2
put("pextrb", REG32e|MEM, XMM, IMM); put("pextrb", REG32e|MEM, XMM, IMM);
@ -1522,6 +1520,23 @@ class Test {
#endif #endif
} }
void putVpclmulqdq()
{
const char tbl[][16] = {
"vpclmullqlqdq",
"vpclmulhqlqdq",
"vpclmullqhqdq",
"vpclmulhqhqdq",
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *p = tbl[i] + 1; // remove the top 'v'
put(p, XMM, XMM|MEM);
p = tbl[i]; // use the top 'v'
put(p, XMM, XMM, XMM|MEM);
put(p, YMM, YMM, YMM|MEM);
put(p, ZMM, ZMM, ZMM|MEM);
}
}
void putSHA() const void putSHA() const
{ {
put("sha1rnds4", XMM, XMM|MEM, IMM); put("sha1rnds4", XMM, XMM|MEM, IMM);
@ -2569,6 +2584,7 @@ public:
putPushPop8_16(); putPushPop8_16();
#else #else
putSIMPLE(); putSIMPLE();
putVpclmulqdq();
putReg1(); putReg1();
putBt(); putBt();
putRorM(); putRorM();

View file

@ -1949,6 +1949,12 @@ CYBOZU_TEST_AUTO(misc)
movdiri(ptr[rax+r12], r9); movdiri(ptr[rax+r12], r9);
movdiri(ptr[rax+r12*2+4], r9d); movdiri(ptr[rax+r12*2+4], r9d);
movdir64b(r10, ptr[r8]); movdir64b(r10, ptr[r8]);
clui();
senduipi(rax);
senduipi(r10);
stui();
testui();
uiret();
#endif #endif
} }
} c; } c;
@ -1972,6 +1978,12 @@ CYBOZU_TEST_AUTO(misc)
0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri 0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri 0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b 0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
0xf3, 0x0f, 0x01, 0xee, // clui
0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
0xf3, 0x0f, 0x01, 0xef, // stui
0xf3, 0x0f, 0x01, 0xed, // testui
0xf3, 0x0f, 0x01, 0xec, // uiret
#endif #endif
}; };
const size_t n = sizeof(tbl) / sizeof(tbl[0]); const size_t n = sizeof(tbl) / sizeof(tbl[0]);
@ -2157,4 +2169,116 @@ CYBOZU_TEST_AUTO(prefetchiti)
CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
} }
CYBOZU_TEST_AUTO(crypto)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vsha512msg1(ymm3, xmm5);
vsha512msg2(ymm9, ymm10);
vsha512rnds2(ymm1, ymm3, xmm2);
vsm3msg1(xmm1, xmm2, xmm3);
vsm3msg1(xmm1, xmm2, ptr [rax]);
vsm3msg2(xmm5, xmm7, xmm3);
vsm3msg2(xmm5, xmm6, ptr [rax]);
vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
vsm4key4(xmm1, xmm2, xmm3);
vsm4key4(xmm1, xmm2, ptr [rdx]);
vsm4rnds4(xmm1, xmm2, xmm3);
vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
}
} c;
const uint8_t tbl[] = {
// sha512
0xc4, 0xe2, 0x7f, 0xcc, 0xdd,
0xc4, 0x42, 0x7f, 0xcd, 0xca,
0xc4, 0xe2, 0x67, 0xcb, 0xca,
// sm3
0xC4, 0xE2, 0x68, 0xDA, 0xCB,
0xC4, 0xE2, 0x68, 0xDA, 0x08,
0xC4, 0xE2, 0x41, 0xDA, 0xEB,
0xC4, 0xE2, 0x49, 0xDA, 0x28,
0xC4, 0xE3, 0x41, 0xDE, 0xEB, 0x12,
0xC4, 0xE3, 0x41, 0xDE, 0x29, 0x34,
// sm4
0xc4, 0xe2, 0x6a, 0xda, 0xcb,
0xc4, 0xe2, 0x6a, 0xda, 0x0a,
0xc4, 0xe2, 0x6b, 0xda, 0xcb,
0xc4, 0xe2, 0x4b, 0xda, 0x2c, 0x81,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(avx_vnni_int)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpdpbssd(xmm1, xmm2, xmm3);
vpdpbssd(ymm1, ymm2, ptr [rax]);
vpdpbssds(xmm1, xmm2, xmm3);
vpdpbssds(ymm1, ymm2, ptr [rax]);
vpdpbsud(xmm1, xmm2, xmm3);
vpdpbsud(ymm1, ymm2, ptr [rax]);
vpdpbsuds(xmm1, xmm2, xmm3);
vpdpbsuds(ymm1, ymm2, ptr [rax]);
vpdpbuud(xmm1, xmm2, xmm3);
vpdpbuud(ymm1, ymm2, ptr [rax]);
vpdpbuuds(xmm1, xmm2, xmm3);
vpdpbuuds(ymm1, ymm2, ptr [rax]);
vpdpwsud(xmm1, xmm2, xmm3);
vpdpwsud(ymm1, ymm2, ptr [rax]);
vpdpwsuds(xmm1, xmm2, xmm3);
vpdpwsuds(ymm1, ymm2, ptr [rax]);
vpdpwusd(xmm1, xmm2, xmm3);
vpdpwusd(ymm1, ymm2, ptr [rax]);
vpdpwusds(xmm1, xmm2, xmm3);
vpdpwusds(ymm1, ymm2, ptr [rax]);
vpdpwuud(xmm1, xmm2, xmm3);
vpdpwuud(ymm1, ymm2, ptr [rax]);
vpdpwuuds(xmm1, xmm2, xmm3);
vpdpwuuds(ymm1, ymm2, ptr [rax]);
}
} c;
const uint8_t tbl[] = {
0xc4, 0xe2, 0x6b, 0x50, 0xcb,
0xc4, 0xe2, 0x6f, 0x50, 0x08,
0xc4, 0xe2, 0x6b, 0x51, 0xcb,
0xc4, 0xe2, 0x6f, 0x51, 0x08,
0xc4, 0xe2, 0x6a, 0x50, 0xcb,
0xc4, 0xe2, 0x6e, 0x50, 0x08,
0xc4, 0xe2, 0x6a, 0x51, 0xcb,
0xc4, 0xe2, 0x6e, 0x51, 0x08,
0xc4, 0xe2, 0x68, 0x50, 0xcb,
0xc4, 0xe2, 0x6c, 0x50, 0x08,
0xc4, 0xe2, 0x68, 0x51, 0xcb,
0xc4, 0xe2, 0x6c, 0x51, 0x08,
0xc4, 0xe2, 0x6a, 0xd2, 0xcb,
0xc4, 0xe2, 0x6e, 0xd2, 0x08,
0xc4, 0xe2, 0x6a, 0xd3, 0xcb,
0xc4, 0xe2, 0x6e, 0xd3, 0x08,
0xc4, 0xe2, 0x69, 0xd2, 0xcb,
0xc4, 0xe2, 0x6d, 0xd2, 0x08,
0xc4, 0xe2, 0x69, 0xd3, 0xcb,
0xc4, 0xe2, 0x6d, 0xd3, 0x08,
0xc4, 0xe2, 0x68, 0xd2, 0xcb,
0xc4, 0xe2, 0x6c, 0xd2, 0x08,
0xc4, 0xe2, 0x68, 0xd3, 0xcb,
0xc4, 0xe2, 0x6c, 0xd3, 0x08,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif #endif

View file

@ -8,14 +8,25 @@
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
std::string normalize(const std::string& line) std::string normalize(std::string line)
{ {
size_t pos = line.find('(');
/* nasm generates byte codes containing () for xbegin, so remove it. */
if (pos != std::string::npos) {
line.erase(pos, 1);
pos = line.find(')');
if (pos == std::string::npos) {
fprintf(stderr, "line error {%s}\n", line.c_str());
return "";
}
line.erase(pos, 1);
}
static const char tbl[][3] = { "66", "67", "F2", "F3" }; static const char tbl[][3] = { "66", "67", "F2", "F3" };
size_t tblNum = sizeof(tbl) / sizeof(tbl[0]); size_t tblNum = sizeof(tbl) / sizeof(tbl[0]);
typedef std::set<std::string> StringSet; typedef std::set<std::string> StringSet;
StringSet suf; StringSet suf;
size_t pos = 0; pos = 0;
for (; pos < line.size(); pos += 2) { for (; pos < line.size(); pos += 2) {
bool found = false; bool found = false;
for (size_t i = 0; i < tblNum; i++) { for (size_t i = 0; i < tblNum; i++) {

View file

@ -23,7 +23,7 @@ echo "xbyak"
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame > x.lst ./nm_frame > x.lst
diff ok.lst x.lst && echo "ok" diff -bB ok.lst x.lst && echo "ok"
} }

View file

@ -5,4 +5,10 @@ call test_address
call test_address 64 call test_address 64
echo *** test jmp address *** echo *** test jmp address ***
call test_jmp call test_jmp
echo *** test misc ***
set FILE=misc
call test_misc
echo *** test APX ***
set FILE=apx
call test_misc
echo *** all test end *** echo *** all test end ***

View file

@ -48,4 +48,4 @@ echo "xbyak"
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -bB ok.lst x.lst && echo "ok"

View file

@ -35,4 +35,4 @@ echo "xbyak"
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512 $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame -DXBYAK_AVX512
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -bB ok.lst x.lst && echo "ok"

View file

@ -1,4 +1,4 @@
call set_opt call set_opt
bmake -f Makefile.win all bmake -f Makefile.win all
cl -I../ -I./ -DXBYAK_TEST misc.cpp %OPT% /Od /Zi cl -I../ -I./ -DXBYAK_TEST %FILE%.cpp %OPT% /Od /Zi
misc %FILE%

View file

@ -61,4 +61,4 @@ echo "xbyak"
echo "compile nm_frame.cpp" echo "compile nm_frame.cpp"
$CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame $CXX $CFLAGS -DXBYAK_TEST nm_frame.cpp -o nm_frame
./nm_frame | $FILTER > x.lst ./nm_frame | $FILTER > x.lst
diff -B ok.lst x.lst && echo "ok" diff -bB ok.lst x.lst && echo "ok"

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -9,6 +9,13 @@
#define XBYAK_THROW(x) ; #define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y; #define XBYAK_THROW_RET(x, y) return y;
#endif #endif
#ifndef XBYAK_CONSTEXPR
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else
#define XBYAK_CONSTEXPR
#endif
#endif
#else #else
#include <string.h> #include <string.h>
@ -93,7 +100,7 @@ struct TypeT {
}; };
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2> template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); } XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
template<typename T> template<typename T>
inline T max_(T x, T y) { return x >= y ? x : y; } inline T max_(T x, T y) { return x >= y ? x : y; }
@ -137,6 +144,7 @@ private:
uint32_t dataCacheSize_[maxNumberCacheLevels]; uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharignDataCache_[maxNumberCacheLevels]; uint32_t coresSharignDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_; uint32_t dataCacheLevels_;
uint32_t avx10version_;
uint32_t get32bitAsBE(const char *x) const uint32_t get32bitAsBE(const char *x) const
{ {
@ -173,11 +181,9 @@ private:
} }
void setNumCores() void setNumCores()
{ {
if (!has(tINTEL)) return; if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {}; uint32_t data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data); getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) { if (data[0] >= 0xB) {
/* /*
@ -211,7 +217,48 @@ private:
} }
void setCacheHierarchy() void setCacheHierarchy()
{ {
if (!has(tINTEL)) return; if (!has(tINTEL) && !has(tAMD)) return;
// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
if (has(tAMD)) {
// There are 3 Data Cache Levels (L1, L2, L3)
dataCacheLevels_ = 3;
const uint32_t leaf = 0x8000001D; // for modern AMD CPus
// Sub leaf value ranges from 0 to 3
// Sub leaf value 0 refers to L1 Data Cache
// Sub leaf value 1 refers to L1 Instruction Cache
// Sub leaf value 2 refers to L2 Cache
// Sub leaf value 3 refers to L3 Cache
// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache
int cache_index = 0;
for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
// Skip sub_leaf = 1 as it refers to
// L1 Instruction Cache (not required)
if (sub_leaf == 1) {
continue;
}
uint32_t data[4] = {};
getCpuidEx(leaf, sub_leaf, data);
// Cache Size = Line Size * Partitions * Associativity * Cache Sets
dataCacheSize_[cache_index] =
(extractBit(data[1], 22, 31) + 1) // Associativity-1
* (extractBit(data[1], 12, 21) + 1) // Partitions-1
* (extractBit(data[1], 0, 11) + 1) // Line Size
* (data[2] + 1);
// Calculate the number of cores sharing the current data cache
int smt_width = numCores_[0];
int logical_cores = numCores_[1];
int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
if (logical_cores != 0) {
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
++cache_index;
}
return;
}
// intel
const uint32_t NO_CACHE = 0; const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1; const uint32_t DATA_CACHE = 1;
// const uint32_t INSTRUCTION_CACHE = 2; // const uint32_t INSTRUCTION_CACHE = 2;
@ -417,6 +464,21 @@ public:
XBYAK_DEFINE_TYPE(72, tRAO_INT); XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD); XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI); XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
XBYAK_DEFINE_TYPE(76, tUINTR);
XBYAK_DEFINE_TYPE(77, tXSAVE);
XBYAK_DEFINE_TYPE(78, tSHA512);
XBYAK_DEFINE_TYPE(79, tSM3);
XBYAK_DEFINE_TYPE(80, tSM4);
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
XBYAK_DEFINE_TYPE(82, tAPX_F);
XBYAK_DEFINE_TYPE(83, tAVX10);
XBYAK_DEFINE_TYPE(84, tAESKLE);
XBYAK_DEFINE_TYPE(85, tWIDE_KL);
XBYAK_DEFINE_TYPE(86, tKEYLOCKER);
XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE);
XBYAK_DEFINE_TYPE(88, tSSE4a);
XBYAK_DEFINE_TYPE(89, tCLWB);
#undef XBYAK_SPLIT_ID #undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE #undef XBYAK_DEFINE_TYPE
@ -428,6 +490,7 @@ public:
, dataCacheSize_() , dataCacheSize_()
, coresSharignDataCache_() , coresSharignDataCache_()
, dataCacheLevels_(0) , dataCacheLevels_(0)
, avx10version_(0)
{ {
uint32_t data[4] = {}; uint32_t data[4] = {};
const uint32_t& EAX = data[0]; const uint32_t& EAX = data[0];
@ -462,13 +525,14 @@ public:
if (maxExtendedNum >= 0x80000001) { if (maxExtendedNum >= 0x80000001) {
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (ECX & (1U << 5)) type_ |= tLZCNT; if (ECX & (1U << 5)) type_ |= tLZCNT;
if (ECX & (1U << 6)) type_ |= tSSE4a;
if (ECX & (1U << 8)) type_ |= tPREFETCHW; if (ECX & (1U << 8)) type_ |= tPREFETCHW;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 31)) type_ |= t3DN;
} }
if (maxExtendedNum >= 0x80000008) { if (maxExtendedNum >= 0x80000008) {
@ -478,16 +542,17 @@ public:
getCpuid(1, data); getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3; if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 9)) type_ |= tSSSE3; if (ECX & (1U << 9)) type_ |= tSSSE3;
if (ECX & (1U << 19)) type_ |= tSSE41; if (ECX & (1U << 19)) type_ |= tSSE41;
if (ECX & (1U << 20)) type_ |= tSSE42; if (ECX & (1U << 20)) type_ |= tSSE42;
if (ECX & (1U << 22)) type_ |= tMOVBE; if (ECX & (1U << 22)) type_ |= tMOVBE;
if (ECX & (1U << 23)) type_ |= tPOPCNT; if (ECX & (1U << 23)) type_ |= tPOPCNT;
if (ECX & (1U << 25)) type_ |= tAESNI; if (ECX & (1U << 25)) type_ |= tAESNI;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; if (ECX & (1U << 26)) type_ |= tXSAVE;
if (ECX & (1U << 27)) type_ |= tOSXSAVE; if (ECX & (1U << 27)) type_ |= tOSXSAVE;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (ECX & (1U << 29)) type_ |= tF16C; if (ECX & (1U << 29)) type_ |= tF16C;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (EDX & (1U << 15)) type_ |= tCMOV; if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 23)) type_ |= tMMX; if (EDX & (1U << 23)) type_ |= tMMX;
@ -498,8 +563,8 @@ public:
// check XFEATURE_ENABLED_MASK[2:1] = '11b' // check XFEATURE_ENABLED_MASK[2:1] = '11b'
uint64_t bv = getXfeature(); uint64_t bv = getXfeature();
if ((bv & 6) == 6) { if ((bv & 6) == 6) {
if (ECX & (1U << 28)) type_ |= tAVX;
if (ECX & (1U << 12)) type_ |= tFMA; if (ECX & (1U << 12)) type_ |= tFMA;
if (ECX & (1U << 28)) type_ |= tAVX;
// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support // do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__) #if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7) if (((bv >> 5) & 7) == 7)
@ -533,29 +598,36 @@ public:
const uint32_t maxNumSubLeaves = EAX; const uint32_t maxNumSubLeaves = EAX;
if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
if (EBX & (1U << 3)) type_ |= tBMI1; if (EBX & (1U << 3)) type_ |= tBMI1;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 8)) type_ |= tBMI2; if (EBX & (1U << 8)) type_ |= tBMI2;
if (EBX & (1U << 9)) type_ |= tENHANCED_REP; if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 18)) type_ |= tRDSEED; if (EBX & (1U << 18)) type_ |= tRDSEED;
if (EBX & (1U << 19)) type_ |= tADX; if (EBX & (1U << 19)) type_ |= tADX;
if (EBX & (1U << 20)) type_ |= tSMAP; if (EBX & (1U << 20)) type_ |= tSMAP;
if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT; if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
if (EBX & (1U << 4)) type_ |= tHLE; if (EBX & (1U << 24)) type_ |= tCLWB;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 29)) type_ |= tSHA; if (EBX & (1U << 29)) type_ |= tSHA;
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
if (ECX & (1U << 5)) type_ |= tWAITPKG; if (ECX & (1U << 5)) type_ |= tWAITPKG;
if (ECX & (1U << 8)) type_ |= tGFNI; if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES; if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 23)) type_ |= tKEYLOCKER;
if (ECX & (1U << 25)) type_ |= tCLDEMOTE; if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
if (ECX & (1U << 27)) type_ |= tMOVDIRI; if (ECX & (1U << 27)) type_ |= tMOVDIRI;
if (ECX & (1U << 28)) type_ |= tMOVDIR64B; if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
if (EDX & (1U << 5)) type_ |= tUINTR;
if (EDX & (1U << 14)) type_ |= tSERIALIZE;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8; if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) { if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data); getCpuidEx(7, 1, data);
if (EAX & (1U << 0)) type_ |= tSHA512;
if (EAX & (1U << 1)) type_ |= tSM3;
if (EAX & (1U << 2)) type_ |= tSM4;
if (EAX & (1U << 3)) type_ |= tRAO_INT; if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) { if (type_ & tAVX512F) {
@ -566,9 +638,22 @@ public:
if (EAX & (1U << 23)) type_ |= tAVX_IFMA; if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI; if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
if (EDX & (1U << 19)) type_ |= tAVX10;
if (EDX & (1U << 21)) type_ |= tAPX_F;
} }
} }
if (maxNum >= 0x19) {
getCpuidEx(0x19, 0, data);
if (EBX & (1U << 0)) type_ |= tAESKLE;
if (EBX & (1U << 2)) type_ |= tWIDE_KL;
if (type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE;
}
if (has(tAVX10) && maxNum >= 0x24) {
getCpuidEx(0x24, 0, data);
avx10version_ = EBX & mask(7);
}
setFamily(); setFamily();
setNumCores(); setNumCores();
setCacheHierarchy(); setCacheHierarchy();
@ -585,6 +670,7 @@ public:
{ {
return (type & type_) == type; return (type & type_) == type;
} }
int getAVX10version() const { return avx10version_; }
}; };
#ifndef XBYAK_ONLY_CLASS_CPU #ifndef XBYAK_ONLY_CLASS_CPU