externals: Update xbyak

- Fix on-demand AVX512 on macOS

Merge commit '2f9dea5cc355c266ad46d2f6397b141b99f78480'
This commit is contained in:
MerryMage 2021-03-27 21:08:22 +00:00
commit ad9b33164e
19 changed files with 297 additions and 124 deletions

View file

@ -0,0 +1,11 @@
name: test
on: [push]
jobs:
build:
name: test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- run: sudo apt install nasm yasm g++-multilib tcsh
- run: make test

1
externals/xbyak/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
# cmake build directories (gitignore has no trailing comments, so the comment needs its own line)
/build*

View file

@ -1,12 +0,0 @@
sudo: true
dist: bionic
language: cpp
compiler:
- gcc
- clang
addons:
apt:
packages:
- nasm yasm g++-multilib tcsh
script:
- make test

View file

@ -1,6 +1,46 @@
cmake_minimum_required(VERSION 2.6) cmake_minimum_required(VERSION 2.6...3.0.2)
project(xbyak)
project(xbyak CXX)
file(GLOB headers xbyak/*.h) file(GLOB headers xbyak/*.h)
install(FILES ${headers} DESTINATION include/xbyak)
# Export an INTERFACE target and a package config when CMake is 3.0.2 or newer.
# The VERSION_GREATER_EQUAL operator only exists since CMake 3.7, so the check is
# written as NOT ... VERSION_LESS, which is understood by every CMake version
# accepted by cmake_minimum_required above.
if (DEFINED CMAKE_VERSION AND NOT CMAKE_VERSION VERSION_LESS 3.0.2)
	include(GNUInstallDirs)
	add_library(${PROJECT_NAME} INTERFACE)
	add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
	target_include_directories(
		${PROJECT_NAME} INTERFACE
		"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
		"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
	)
	install(
		TARGETS ${PROJECT_NAME}
		EXPORT ${PROJECT_NAME}-targets
	)
	# Generate <project>Config.cmake in the build tree from the template.
	configure_file(
		cmake/config.cmake.in
		${PROJECT_NAME}Config.cmake
		@ONLY
	)
	install(
		FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
		DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
	)
	install(
		EXPORT ${PROJECT_NAME}-targets
		NAMESPACE ${PROJECT_NAME}::
		DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
	)
elseif(NOT DEFINED CMAKE_INSTALL_INCLUDEDIR)
	# Older CMake: GNUInstallDirs is not included, so provide the header destination here.
	set(CMAKE_INSTALL_INCLUDEDIR "include")
endif()
install(
FILES ${headers}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/xbyak
)

1
externals/xbyak/cmake/config.cmake.in vendored Normal file
View file

@ -0,0 +1 @@
include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@-targets.cmake")

View file

@ -363,12 +363,6 @@ void putX_X_XM_IMM()
{ 0x73, "vpshrdvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, { 0x73, "vpshrdvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x73, "vpshrdvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false }, { 0x73, "vpshrdvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false },
{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, { 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false }, { 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
}; };

View file

@ -1729,6 +1729,24 @@ void put()
printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W%d, 0x%x, %d); }\n", p.name, p.w, p.code, p.mode); printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W%d, 0x%x, %d); }\n", p.name, p.w, p.code, p.mode);
} }
} }
// vnni : print one mnemonic per table entry; each generated method takes a
// PreferredEncoding argument and forwards to opVnni
{
	const struct Tbl {
		uint8_t code;
		const char *name;
		int type;
	} tbl[] = {
		{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
		{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
	};
	const size_t tblNum = NUM_OF_ARRAY(tbl);
	for (size_t idx = 0; idx < tblNum; idx++) {
		const Tbl& entry = tbl[idx];
		// textual form of the type flags, pasted into the generated source
		const std::string typeStr = type2String(entry.type);
		printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, %s, 0x%02X, encoding); }\n", entry.name, typeStr.c_str(), entry.code);
	}
}
} }
void put32() void put32()

View file

@ -1,11 +1,14 @@
[![Build Status](https://travis-ci.org/herumi/xbyak.png)](https://travis-ci.org/herumi/xbyak) [![Build Status](https://github.com/herumi/xbyak/actions/workflows/main.yml/badge.svg)](https://github.com/herumi/xbyak/actions/workflows/main.yml)
# Xbyak 5.97 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ # Xbyak 5.991 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
## Abstract ## Abstract
Xbyak is a C++ header library that enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic. Xbyak is a C++ header library that enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic.
The pronunciation of Xbyak is `kəi-bja-k`.
It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl=ja&tl=en&text=%E9%96%8B%E9%97%A2&op=translate), which means the beginning of the world.
## Feature ## Feature
* header file only * header file only
* Intel/MASM like syntax * Intel/MASM like syntax
@ -16,6 +19,7 @@ Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
If you want to use them, then specify `-fno-operator-names` option to gcc/clang. If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### News ### News
- vnni instructions such as vpdpbusd support vex encoding.
- (break backward compatibility) `push(byte, imm)` (resp. `push(word, imm)`) forces to cast `imm` to 8(resp. 16) bit. - (break backward compatibility) `push(byte, imm)` (resp. `push(word, imm)`) forces to cast `imm` to 8(resp. 16) bit.
- (Windows) `#include <winsock2.h>` has been removed from xbyak.h, so add it explicitly if you need it. - (Windows) `#include <winsock2.h>` has been removed from xbyak.h, so add it explicitly if you need it.
- support exception-less mode see. [Exception-less mode](#exception-less-mode) - support exception-less mode see. [Exception-less mode](#exception-less-mode)
@ -154,6 +158,10 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]);
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
``` ```
### Remark ### Remark
* `k1`, ..., `k7` are opmask registers. * `k1`, ..., `k7` are opmask registers.
@ -339,9 +347,9 @@ public:
## User allocated memory ## User allocated memory
You can make jit code on prepaired memory. You can make jit code on prepared memory.
Call `setProtectModeRE` yourself to change memory mode if using the prepaired memory. Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
``` ```
uint8_t alignas(4096) buf[8192]; // C++11 or later uint8_t alignas(4096) buf[8192]; // C++11 or later
@ -438,6 +446,9 @@ modified new BSD License
http://opensource.org/licenses/BSD-3-Clause http://opensource.org/licenses/BSD-3-Clause
## History ## History
* 2020/Nov/16 ver 5.991 disable constexpr for gcc-5 with -std=c++14
* 2020/Oct/19 ver 5.99 support VNNI instructions(Thanks to akharito)
* 2020/Oct/17 ver 5.98 support the form of [scale * reg]
* 2020/Sep/08 ver 5.97 replace uint32 with uint32_t etc. * 2020/Sep/08 ver 5.97 replace uint32 with uint32_t etc.
* 2020/Aug/28 ver 5.95 some constructors of register classes support constexpr if C++14 or later * 2020/Aug/28 ver 5.95 some constructors of register classes support constexpr if C++14 or later
* 2020/Aug/04 ver 5.941 `CodeGenerator::reset()` calls `ClearError()`. * 2020/Aug/04 ver 5.941 `CodeGenerator::reset()` calls `ClearError()`.

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.97 C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.991
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎概要 ◎概要
@ -163,6 +163,9 @@ vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5)
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 256-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 256-bit
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
注意 注意
* k1, ..., k7 は新しいopmaskレジスタです。 * k1, ..., k7 は新しいopmaskレジスタです。
@ -379,6 +382,9 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎履歴 ◎履歴
2020/11/16 ver 5.991 g++-5のC++14でconstexpr機能の抑制
2020/10/19 ver 5.99 VNNI命令サポート(Thanks to akharito)
2020/10/17 ver 5.98 [scale * reg]のサポート
2020/09/08 ver 5.97 uint32などをuint32_tに置換 2020/09/08 ver 5.97 uint32などをuint32_tに置換
2020/08/28 ver 5.95 レジスタクラスのコンストラクタがconstexprに対応(C++14以降) 2020/08/28 ver 5.95 レジスタクラスのコンストラクタがconstexprに対応(C++14以降)
2020/08/04 ver 5.941 `CodeGenerator::reset()`が`ClearError()`を呼ぶように変更 2020/08/04 ver 5.941 `CodeGenerator::reset()`が`ClearError()`を呼ぶように変更

View file

@ -1,12 +1,13 @@
#include <stdio.h> #include <stdio.h>
#define XBYAK_NO_OP_NAMES
#include "xbyak/xbyak_util.h" #include "xbyak/xbyak_util.h"
#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0])) #define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
struct PopCountTest : public Xbyak::CodeGenerator { struct PopCountTest : public Xbyak::CodeGenerator {
PopCountTest(int n) PopCountTest(int n)
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
{ {
ret();
mov(eax, n); mov(eax, n);
popcnt(eax, eax); popcnt(eax, eax);
ret(); ret();
@ -80,6 +81,10 @@ void putCPUinfo()
{ Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" }, { Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" },
{ Cpu::tAVX512_BF16, "avx512_bf16" }, { Cpu::tAVX512_BF16, "avx512_bf16" },
{ Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" }, { Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" },
{ Cpu::tAMX_TILE, "amx(tile)" },
{ Cpu::tAMX_INT8, "amx(int8)" },
{ Cpu::tAMX_BF16, "amx(bf16)" },
{ Cpu::tAVX_VNNI, "avx_vnni" },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str); if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
@ -88,12 +93,16 @@ void putCPUinfo()
if (cpu.has(Cpu::tPOPCNT)) { if (cpu.has(Cpu::tPOPCNT)) {
const int n = 0x12345678; // bitcount = 13 const int n = 0x12345678; // bitcount = 13
const int ok = 13; const int ok = 13;
int r = PopCountTest(n).getCode<int (*)()>()(); PopCountTest code(n);
code.setProtectModeRE();
int (*f)() = code.getCode<int (*)()>();
int r = f();
if (r == ok) { if (r == ok) {
puts("popcnt ok"); puts("popcnt ok");
} else { } else {
printf("popcnt ng %d %d\n", r, ok); printf("popcnt ng %d %d\n", r, ok);
} }
code.setProtectModeRW();
} }
/* /*
displayFamily displayModel displayFamily displayModel

View file

@ -643,6 +643,7 @@ class Test {
puts(isXbyak_ ? "out_(dx, al); dump();" : "out dx, al"); puts(isXbyak_ ? "out_(dx, al); dump();" : "out dx, al");
puts(isXbyak_ ? "out_(dx, ax); dump();" : "out dx, ax"); puts(isXbyak_ ? "out_(dx, ax); dump();" : "out dx, ax");
puts(isXbyak_ ? "out_(dx, eax); dump();" : "out dx, eax"); puts(isXbyak_ ? "out_(dx, eax); dump();" : "out dx, eax");
puts(isXbyak_ ? "lea(eax, ptr[edi + 4 * eax]); dump();" : "lea eax, [edi + 4 * eax]");
} }
void putJmp() const void putJmp() const
{ {

View file

@ -815,4 +815,32 @@ CYBOZU_TEST_AUTO(tileloadd)
CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception); CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception);
CYBOZU_TEST_EXCEPTION(c.notSupported2(), std::exception); CYBOZU_TEST_EXCEPTION(c.notSupported2(), std::exception);
} }
// Checks the bytes emitted by vpdpbusd for each PreferredEncoding value and
// that an unsupported VEX operand raises an exception.
CYBOZU_TEST_AUTO(vnni)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// no encoding argument: the default encoding is EVEX
vpdpbusd(xm0, xm1, xm2);
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // explicit EVEX, expected to match the line above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // explicit VEX
}
// Not called from the constructor; invoked below where an exception is expected.
// NOTE(review): presumably fails because xm31 cannot be expressed in a VEX encoding - confirm
void badVex()
{
vpdpbusd(xm0, xm1, xm31, VexEncoding);
}
} c;
// Expected output of Code(), one row per vpdpbusd call in the constructor, in order.
const uint8_t tbl[] = {
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// Compare the size first, then the generated bytes themselves.
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
CYBOZU_TEST_EXCEPTION(c.badVex(), std::exception);
}
#endif #endif

View file

@ -12,7 +12,7 @@ g++ $CFLAGS address.cpp -o address
./address $1 > a.asm ./address $1 > a.asm
echo "asm" echo "asm"
$EXE -f$OPT3 a.asm -l a.lst $EXE -f$OPT3 a.asm -l a.lst
awk '{if (index($3, "-")) { conti=substr($3, 0, length($3) - 1) } else { conti = conti $3; print conti; conti = "" }} ' < a.lst | $FILTER > ok.lst awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./address $1 jit > nm.cpp ./address $1 jit > nm.cpp

View file

@ -1,39 +1,44 @@
#!/bin/tcsh #!/bin/sh
set FILTER="grep -v warning" FILTER="grep -v warning"
if ($1 == "Y") then case $1 in
Y)
echo "yasm(32bit)" echo "yasm(32bit)"
set EXE=yasm EXE=yasm
set OPT2="-DUSE_YASM -DXBYAK32" OPT2="-DUSE_YASM -DXBYAK32"
set OPT3=win32 OPT3=win32
else if ($1 == "64") then ;;
64)
echo "nasm(64bit)" echo "nasm(64bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK64 OPT2=-DXBYAK64
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else if ($1 == "Y64") then ;;
Y64)
echo "yasm(64bit)" echo "yasm(64bit)"
set EXE=yasm EXE=yasm
set OPT2="-DUSE_YASM -DXBYAK64" OPT2="-DUSE_YASM -DXBYAK64"
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else ;;
*)
echo "nasm(32bit)" echo "nasm(32bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK32 OPT2=-DXBYAK32
set OPT3=win32 OPT3=win32
endif ;;
esac
set CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX" CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX"
echo "compile make_nm.cpp" echo "compile make_nm.cpp"
g++ $CFLAGS make_nm.cpp -o make_nm g++ $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm ./make_nm > a.asm
echo "asm" echo "asm"
$EXE -f$OPT3 a.asm -l a.lst $EXE -f$OPT3 a.asm -l a.lst
awk '{if (index($3, "-")) { conti=substr($3, 0, length($3) - 1) } else { conti = conti $3; print conti; conti = "" }} ' < a.lst | $FILTER | grep -v "1+1" > ok.lst awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./make_nm jit > nm.cpp ./make_nm jit > nm.cpp

View file

@ -1,28 +1,31 @@
#!/bin/tcsh #!/bin/sh
set FILTER="grep -v warning" FILTER="grep -v warning"
if ($1 == "64") then case $1 in
64)
echo "nasm(64bit)" echo "nasm(64bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK64 OPT2=-DXBYAK64
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else ;;
*)
echo "nasm(32bit)" echo "nasm(32bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK32 OPT2=-DXBYAK32
set OPT3=win32 OPT3=win32
endif ;;
esac
set CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512" CFLAGS="-Wall -fno-operator-names -I../ $OPT2 -DUSE_AVX512"
echo "compile make_512.cpp" echo "compile make_512.cpp"
g++ $CFLAGS make_512.cpp -o make_512 g++ $CFLAGS make_512.cpp -o make_512
./make_512 > a.asm ./make_512 > a.asm
echo "asm" echo "asm"
$EXE -f$OPT3 a.asm -l a.lst $EXE -f$OPT3 a.asm -l a.lst
awk '{if (index($3, "-")) { conti=substr($3, 0, length($3) - 1) } else { conti = conti $3; print conti; conti = "" }} ' < a.lst | $FILTER > ok.lst awk '{printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./make_512 jit > nm.cpp ./make_512 jit > nm.cpp

View file

@ -1,50 +1,57 @@
#!/bin/tcsh #!/bin/sh
set FILTER=cat FILTER=cat
if ($1 == "Y") then case $1 in
Y)
echo "yasm(32bit)" echo "yasm(32bit)"
set EXE=yasm EXE=yasm
set OPT2="-DUSE_YASM -DXBYAK32" OPT2="-DUSE_YASM -DXBYAK32"
set OPT3=win32 OPT3=win32
else if ($1 == "64") then ;;
64)
echo "nasm(64bit)" echo "nasm(64bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK64 OPT2=-DXBYAK64
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else if ($1 == "Y64") then ;;
Y64)
echo "yasm(64bit)" echo "yasm(64bit)"
set EXE=yasm EXE=yasm
set OPT2="-DUSE_YASM -DXBYAK64" OPT2="-DUSE_YASM -DXBYAK64"
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else if ($1 == "avx512") then ;;
avx512)
echo "nasm(64bit) + avx512" echo "nasm(64bit) + avx512"
set EXE=nasm EXE=nasm
set OPT2="-DXBYAK64 -DUSE_AVX512" OPT2="-DXBYAK64 -DUSE_AVX512"
set OPT3=win64 OPT3=win64
set FILTER=./normalize_prefix FILTER=./normalize_prefix
else if ($1 == "noexcept") then ;;
noexcept)
echo "nasm(32bit) without exception" echo "nasm(32bit) without exception"
set EXE=nasm EXE=nasm
set OPT2="-DXBYAK32 -DXBYAK_NO_EXCEPTION" OPT2="-DXBYAK32 -DXBYAK_NO_EXCEPTION"
set OPT3=win32 OPT3=win32
else ;;
*)
echo "nasm(32bit)" echo "nasm(32bit)"
set EXE=nasm EXE=nasm
set OPT2=-DXBYAK32 OPT2=-DXBYAK32
set OPT3=win32 OPT3=win32
endif ;;
esac
set CFLAGS="-Wall -fno-operator-names -I../ $OPT2" CFLAGS="-Wall -fno-operator-names -I../ $OPT2"
echo "compile make_nm.cpp with $CFLAGS" echo "compile make_nm.cpp with $CFLAGS"
g++ $CFLAGS make_nm.cpp -o make_nm g++ $CFLAGS make_nm.cpp -o make_nm
./make_nm > a.asm ./make_nm > a.asm
echo "asm" echo "asm"
$EXE -f$OPT3 a.asm -l a.lst $EXE -f$OPT3 a.asm -l a.lst
awk '{if (index($3, "-")) { conti=substr($3, 0, length($3) - 1) } else { conti = conti $3; print conti; conti = "" }} ' < a.lst | $FILTER | grep -v "1+1" > ok.lst awk '$3 != "1+1" {printf "%s", sub(/-$/, "", $3) ? $3 : $3 ORS}' a.lst | $FILTER > ok.lst
echo "xbyak" echo "xbyak"
./make_nm jit > nm.cpp ./make_nm jit > nm.cpp

View file

@ -108,7 +108,7 @@
#endif #endif
#endif #endif
#if (__cplusplus >= 201103) || (_MSC_VER >= 1800) #if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800)
#undef XBYAK_TLS #undef XBYAK_TLS
#define XBYAK_TLS thread_local #define XBYAK_TLS thread_local
#define XBYAK_VARIADIC_TEMPLATE #define XBYAK_VARIADIC_TEMPLATE
@ -117,8 +117,11 @@
#define XBYAK_NOEXCEPT throw() #define XBYAK_NOEXCEPT throw()
#endif #endif
#if (__cplusplus >= 201402L) || (_MSC_VER >= 1910) // Visual Studio 2017 version 15.0 // require c++14 or later
#define XBYAK_CONSTEXPR constexpr // require c++14 or later // Visual Studio 2017 version 15.0 or later
// g++-6 or later
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else #else
#define XBYAK_CONSTEXPR #define XBYAK_CONSTEXPR
#endif #endif
@ -135,7 +138,7 @@ namespace Xbyak {
enum { enum {
DEFAULT_MAX_CODE_SIZE = 4096, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x5970 /* 0xABCD = A.BC(D) */ VERSION = 0x5991 /* 0xABCD = A.BC(D) */
}; };
#ifndef MIE_INTEGER_TYPE_DEFINED #ifndef MIE_INTEGER_TYPE_DEFINED
@ -413,16 +416,16 @@ public:
{ {
const size_t alignedSizeM1 = inner::ALIGN_PAGE_SIZE - 1; const size_t alignedSizeM1 = inner::ALIGN_PAGE_SIZE - 1;
size = (size + alignedSizeM1) & ~alignedSizeM1; size = (size + alignedSizeM1) & ~alignedSizeM1;
#if defined(XBYAK_USE_MAP_JIT) #if defined(MAP_ANONYMOUS)
int mode = MAP_PRIVATE | MAP_ANONYMOUS; int mode = MAP_PRIVATE | MAP_ANONYMOUS;
const int mojaveVersion = 18;
if (util::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT;
#elif defined(MAP_ANONYMOUS)
const int mode = MAP_PRIVATE | MAP_ANONYMOUS;
#elif defined(MAP_ANON) #elif defined(MAP_ANON)
const int mode = MAP_PRIVATE | MAP_ANON; int mode = MAP_PRIVATE | MAP_ANON;
#else #else
#error "not supported" #error "not supported"
#endif
#if defined(XBYAK_USE_MAP_JIT)
const int mojaveVersion = 18;
if (util::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT;
#endif #endif
void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, -1, 0); void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, -1, 0);
if (p == MAP_FAILED) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0) if (p == MAP_FAILED) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
@ -924,6 +927,10 @@ inline RegExp operator*(const Reg& r, int scale)
{ {
return RegExp(r, scale); return RegExp(r, scale);
} }
// scale * reg : same result as reg * scale, so that [edi + 4 * eax] can be written
inline RegExp operator*(int scale, const Reg& r)
{
	return RegExp(r, scale);
}
inline RegExp operator-(const RegExp& e, size_t disp) inline RegExp operator-(const RegExp& e, size_t disp)
{ {
RegExp ret = e; RegExp ret = e;
@ -1539,6 +1546,12 @@ inline const uint8_t* Label::getAddress() const
return mgr->getCode() + offset; return mgr->getCode() + offset;
} }
// Encoding selector passed as the last argument of the vnni mnemonics (vpdpbusd etc.)
typedef enum {
DefaultEncoding, // opVnni handles this the same way as EvexEncoding
VexEncoding, // opVnni passes the type to opAVX_X_X_XM without adding T_MUST_EVEX
EvexEncoding // opVnni adds T_MUST_EVEX to the type
} PreferredEncoding;
class CodeGenerator : public CodeArray { class CodeGenerator : public CodeArray {
public: public:
enum LabelType { enum LabelType {
@ -2293,6 +2306,19 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code); opVex(x, 0, addr, type, code);
} }
/*
	common body of the vnni mnemonics (vpdpbusd etc.)
	DefaultEncoding and EvexEncoding : T_MUST_EVEX is added to type
	(reported as ERR_EVEX_IS_INVALID if XBYAK_DISABLE_AVX512 is defined)
	VexEncoding : type is passed to opAVX_X_X_XM as it is
*/
void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
{
	const bool wantEvex = (encoding == DefaultEncoding) || (encoding == EvexEncoding);
	if (wantEvex) {
#ifdef XBYAK_DISABLE_AVX512
		XBYAK_THROW(ERR_EVEX_IS_INVALID)
#endif
		type |= T_MUST_EVEX;
	}
	opAVX_X_X_XM(x1, x2, op, type, code0);
}
void opInOut(const Reg& a, const Reg& d, uint8_t code) void opInOut(const Reg& a, const Reg& d, uint8_t code)
{ {
if (a.getIdx() == Operand::AL && d.getIdx() == Operand::DX && d.getBit() == 16) { if (a.getIdx() == Operand::AL && d.getIdx() == Operand::DX && d.getBit() == 16) {

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "5.97"; } const char *getVersionString() const { return "5.991"; }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -1180,6 +1180,10 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
@ -1883,10 +1887,6 @@ void vpcompressd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B); } void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B); }
void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4); }
void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x50); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x53); }
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); }
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); }
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); }

View file

@ -368,6 +368,7 @@ public:
static const Type tAMX_TILE = uint64_t(1) << 59; static const Type tAMX_TILE = uint64_t(1) << 59;
static const Type tAMX_INT8 = uint64_t(1) << 60; static const Type tAMX_INT8 = uint64_t(1) << 60;
static const Type tAMX_BF16 = uint64_t(1) << 61; static const Type tAMX_BF16 = uint64_t(1) << 61;
static const Type tAVX_VNNI = uint64_t(1) << 62;
Cpu() Cpu()
: type_(NONE) : type_(NONE)
@ -389,19 +390,35 @@ public:
if (ECX == get32bitAsBE(amd)) { if (ECX == get32bitAsBE(amd)) {
type_ |= tAMD; type_ |= tAMD;
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN; if (EDX & (1U << 31)) {
if (EDX & (1U << 15)) type_ |= tCMOV; type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN; // 3DNow! implies support for PREFETCHW on AMD
if (EDX & (1U << 22)) type_ |= tMMX2; type_ |= tPREFETCHW;
if (EDX & (1U << 27)) type_ |= tRDTSCP; }
if (EDX & (1U << 29)) {
// Long mode implies support for PREFETCHW on AMD
type_ |= tPREFETCHW;
}
} }
if (ECX == get32bitAsBE(intel)) { if (ECX == get32bitAsBE(intel)) {
type_ |= tINTEL; type_ |= tINTEL;
}
// Extended flags information
getCpuid(0x80000000, data);
if (EAX >= 0x80000001) {
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 27)) type_ |= tRDTSCP; if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (ECX & (1U << 5)) type_ |= tLZCNT; if (ECX & (1U << 5)) type_ |= tLZCNT;
if (ECX & (1U << 8)) type_ |= tPREFETCHW; if (ECX & (1U << 8)) type_ |= tPREFETCHW;
} }
getCpuid(1, data); getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3; if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 9)) type_ |= tSSSE3; if (ECX & (1U << 9)) type_ |= tSSSE3;
@ -426,7 +443,11 @@ public:
if ((bv & 6) == 6) { if ((bv & 6) == 6) {
if (ECX & (1U << 28)) type_ |= tAVX; if (ECX & (1U << 28)) type_ |= tAVX;
if (ECX & (1U << 12)) type_ |= tFMA; if (ECX & (1U << 12)) type_ |= tFMA;
if (((bv >> 5) & 7) == 7) { // do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7)
#endif
{
getCpuidEx(7, 0, data); getCpuidEx(7, 0, data);
if (EBX & (1U << 16)) type_ |= tAVX512F; if (EBX & (1U << 16)) type_ |= tAVX512F;
if (type_ & tAVX512F) { if (type_ & tAVX512F) {
@ -449,16 +470,12 @@ public:
if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
} }
// EAX=07H, ECX=1
getCpuidEx(7, 1, data);
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
} }
} }
} }
if (maxNum >= 7) { if (maxNum >= 7) {
getCpuidEx(7, 0, data); getCpuidEx(7, 0, data);
const uint32_t maxNumSubLeaves = EAX;
if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
if (EBX & (1U << 3)) type_ |= tBMI1; if (EBX & (1U << 3)) type_ |= tBMI1;
if (EBX & (1U << 8)) type_ |= tBMI2; if (EBX & (1U << 8)) type_ |= tBMI2;
@ -474,6 +491,13 @@ public:
if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8; if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
}
} }
setFamily(); setFamily();
setNumCores(); setNumCores();