2190 lines
62 KiB
C++
2190 lines
62 KiB
C++
#include <stdio.h>
|
|
#include "xbyak/xbyak.h"
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "cybozu/inttype.hpp"
|
|
#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
|
|
|
|
using namespace Xbyak;
|
|
|
|
const int bitEnd = 64;
|
|
|
|
const uint64 YMM_SAE = 1ULL << 0;
|
|
const uint64 _XMM = 1ULL << 1;
|
|
const uint64 _MEM = 1ULL << 2;
|
|
const uint64 _REG32 = 1ULL << 3;
|
|
const uint64 EAX = 1ULL << 4;
|
|
const uint64 IMM32 = 1ULL << 5;
|
|
const uint64 IMM8 = 1ULL << 6;
|
|
const uint64 _REG8 = 1ULL << 7;
|
|
const uint64 _REG16 = 1ULL << 8;
|
|
const uint64 XMM_K = 1ULL << 9;
|
|
const uint64 YMM_K = 1ULL << 10;
|
|
const uint64 ZMM_K = 1ULL << 11;
|
|
const uint64 AX = 1ULL << 12;
|
|
const uint64 AL = 1ULL << 13;
|
|
const uint64 IMM_1 = 1ULL << 14;
|
|
const uint64 MEM8 = 1ULL << 15;
|
|
const uint64 MEM16 = 1ULL << 16;
|
|
const uint64 MEM32 = 1ULL << 17;
|
|
const uint64 VM32Z = 1ULL << 19;
|
|
const uint64 K_K = 1ULL << 20;
|
|
const uint64 MEM_ONLY_DISP = 1ULL << 21;
|
|
const uint64 VM32X_K = 1ULL << 23;
|
|
const uint64 _YMM = 1ULL << 24;
|
|
const uint64 VM32X_32 = 1ULL << 39;
|
|
const uint64 VM32X_64 = 1ULL << 40;
|
|
const uint64 VM32Y_32 = 1ULL << 41;
|
|
const uint64 VM32Y_64 = 1ULL << 42;
|
|
const uint64 VM32Z_K = 1ULL << 32;
|
|
#ifdef XBYAK64
|
|
const uint64 _MEMe = 1ULL << 25;
|
|
const uint64 REG32_2 = 1ULL << 26; // r8d, ...
|
|
const uint64 REG16_2 = 1ULL << 27; // r8w, ...
|
|
const uint64 REG8_2 = 1ULL << 28; // r8b, ...
|
|
const uint64 REG8_3 = 1ULL << 29; // spl, ...
|
|
const uint64 _REG64 = 1ULL << 30; // rax, ...
|
|
const uint64 _REG64_2 = 1ULL << 31; // r8, ...
|
|
const uint64 _XMM2 = 1ULL << 33;
|
|
const uint64 _YMM2 = 1ULL << 34;
|
|
const uint64 VM32X = VM32X_32 | VM32X_64;
|
|
const uint64 VM32Y = VM32Y_32 | VM32Y_64;
|
|
#else
|
|
const uint64 _MEMe = 0;
|
|
const uint64 REG32_2 = 0;
|
|
const uint64 REG16_2 = 0;
|
|
const uint64 REG8_2 = 0;
|
|
const uint64 REG8_3 = 0;
|
|
const uint64 _REG64 = 0;
|
|
const uint64 _REG64_2 = 0;
|
|
const uint64 _XMM2 = 0;
|
|
const uint64 _YMM2 = 0;
|
|
const uint64 VM32X = VM32X_32;
|
|
const uint64 VM32Y = VM32Y_32;
|
|
#endif
|
|
const uint64 REG64 = _REG64 | _REG64_2;
|
|
const uint64 REG32 = _REG32 | REG32_2 | EAX;
|
|
const uint64 REG16 = _REG16 | REG16_2 | AX;
|
|
const uint64 REG32e = REG32 | REG64;
|
|
const uint64 REG8 = _REG8 | REG8_2|AL;
|
|
const uint64 MEM = _MEM | _MEMe;
|
|
const uint64 MEM64 = 1ULL << 35;
|
|
const uint64 YMM_ER = 1ULL << 36;
|
|
const uint64 VM32Y_K = 1ULL << 37;
|
|
const uint64 IMM_2 = 1ULL << 38;
|
|
const uint64 IMM = IMM_1 | IMM_2;
|
|
const uint64 XMM = _XMM | _XMM2;
|
|
const uint64 YMM = _YMM | _YMM2;
|
|
const uint64 K = 1ULL << 43;
|
|
const uint64 _ZMM = 1ULL << 44;
|
|
const uint64 _ZMM2 = 1ULL << 45;
|
|
#ifdef XBYAK64
|
|
const uint64 ZMM = _ZMM | _ZMM2;
|
|
const uint64 _YMM3 = 1ULL << 46;
|
|
#else
|
|
const uint64 ZMM = _ZMM;
|
|
const uint64 _YMM3 = 0;
|
|
#endif
|
|
const uint64 K2 = 1ULL << 47;
|
|
const uint64 ZMM_SAE = 1ULL << 48;
|
|
const uint64 ZMM_ER = 1ULL << 49;
|
|
#ifdef XBYAK64
|
|
const uint64 _XMM3 = 1ULL << 50;
|
|
#endif
|
|
const uint64 XMM_SAE = 1ULL << 51;
|
|
#ifdef XBYAK64
|
|
const uint64 XMM_KZ = 1ULL << 52;
|
|
const uint64 YMM_KZ = 1ULL << 53;
|
|
const uint64 ZMM_KZ = 1ULL << 54;
|
|
#else
|
|
const uint64 XMM_KZ = 0;
|
|
const uint64 YMM_KZ = 0;
|
|
const uint64 ZMM_KZ = 0;
|
|
#endif
|
|
const uint64 MEM_K = 1ULL << 55;
|
|
const uint64 M_1to2 = 1ULL << 56;
|
|
const uint64 M_1to4 = 1ULL << 57;
|
|
const uint64 M_1to8 = 1ULL << 58;
|
|
const uint64 M_1to16 = 1ULL << 59;
|
|
const uint64 XMM_ER = 1ULL << 60;
|
|
const uint64 M_xword = 1ULL << 61;
|
|
const uint64 M_yword = 1ULL << 62;
|
|
const uint64 MY_1to4 = 1ULL << 18;
|
|
|
|
const uint64 NOPARA = 1ULL << (bitEnd - 1);
|
|
|
|
class Test {
|
|
Test(const Test&);
|
|
void operator=(const Test&);
|
|
const bool isXbyak_;
|
|
int funcNum_;
|
|
// emit one line for every combination of the operand types set in op1, op2, op3 and op4
|
|
void put(const std::string& nm, uint64 op1 = NOPARA, uint64 op2 = NOPARA, uint64 op3 = NOPARA, uint64 op4 = NOPARA) const
|
|
{
|
|
for (int i = 0; i < bitEnd; i++) {
|
|
if ((op1 & (1ULL << i)) == 0) continue;
|
|
for (int j = 0; j < bitEnd; j++) {
|
|
if ((op2 & (1ULL << j)) == 0) continue;
|
|
for (int k = 0; k < bitEnd; k++) {
|
|
if ((op3 & (1ULL << k)) == 0) continue;
|
|
for (int s = 0; s < bitEnd; s++) {
|
|
if ((op4 & (1ULL << s)) == 0) continue;
|
|
printf("%s ", nm.c_str());
|
|
if (isXbyak_) printf("(");
|
|
if (!(op1 & NOPARA)) printf("%s", get(1ULL << i));
|
|
if (!(op2 & NOPARA)) printf(", %s", get(1ULL << j));
|
|
if (!(op3 & NOPARA)) printf(", %s", get(1ULL << k));
|
|
if (!(op4 & NOPARA)) printf(", %s", get(1ULL << s));
|
|
if (isXbyak_) printf("); dump();");
|
|
printf("\n");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
void put(const char *nm, uint64 op, const char *xbyak, const char *nasm) const
|
|
{
|
|
for (int i = 0; i < bitEnd; i++) {
|
|
if ((op & (1ULL << i)) == 0) continue;
|
|
printf("%s ", nm);
|
|
if (isXbyak_) printf("(");
|
|
if (!(op & NOPARA)) printf("%s", get(1ULL << i));
|
|
printf(", %s", isXbyak_ ? xbyak : nasm);
|
|
if (isXbyak_) printf("); dump();");
|
|
printf("\n");
|
|
}
|
|
}
|
|
void put(const char *nm, const char *xbyak, const char *nasm = 0, uint64 op = NOPARA) const
|
|
{
|
|
if (nasm == 0) nasm = xbyak;
|
|
for (int i = 0; i < bitEnd; i++) {
|
|
if ((op & (1ULL << i)) == 0) continue;
|
|
printf("%s ", nm);
|
|
if (isXbyak_) printf("(");
|
|
printf("%s ", isXbyak_ ? xbyak : nasm);
|
|
if (!(op & NOPARA)) printf(", %s", get(1ULL << i));
|
|
if (isXbyak_) printf("); dump();");
|
|
printf("\n");
|
|
}
|
|
}
|
|
// Returns the operand text for one operand-type flag.
//   type  a single operand-type flag (one of the constants defined at file scope)
// Register classes return an entry of a small static table chosen with rand();
// most other types return a fixed string that depends on isXbyak_
// (Xbyak expression when true, assembler text when false).
// Returns 0 when type matches none of the cases.
const char *get(uint64 type) const
{
	// rand() is consumed exactly once per call, whether or not the type uses the slot.
	const int slot = (rand() / 31) & 7;
	const bool xb = isXbyak_;
	switch (type) {
	case _XMM:
		{
			static const char xmmLow[][6] = {
				"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
			};
			return xmmLow[slot];
		}
	case _YMM:
		{
			static const char ymmLow[][6] = {
				"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
			};
			return ymmLow[slot];
		}
	case _ZMM:
		{
			static const char zmmLow[][6] = {
				"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7"
			};
			return zmmLow[slot];
		}
#ifdef XBYAK64
	case _XMM2:
		{
			static const char xmmMid[][6] = {
				"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
			};
			return xmmMid[slot];
		}
	case _XMM3:
		{
			static const char xmmHigh[][6] = {
				"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
			};
			return xmmHigh[slot];
		}
	case _YMM2:
		{
			static const char ymmMid[][6] = {
				"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
			};
			return ymmMid[slot];
		}
	case _YMM3:
		{
			static const char ymmHigh[][6] = {
				"ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
			};
			return ymmHigh[slot];
		}
	case _ZMM2:
		{
			static const char zmmHigh[][6] = {
				"zmm8", "zmm9", "zmm10", "zmm11", "zmm28", "zmm29", "zmm30", "zmm31",
			};
			return zmmHigh[slot];
		}
#endif
	case _MEM:
		return xb ? "ptr[eax+ecx+64]" : "[eax+ecx+64]"; // QQQ
//		return isXbyak_ ? "ptr[eax+ecx+6]" : "[eax+ecx+6]";
	case _MEMe:
		{
			// The counter only changes when USE_YASM is defined; otherwise it stays odd.
			static int ccc = 1;
#ifdef USE_YASM
			ccc++;
#endif
			const bool odd = (ccc & 1) != 0;
			if (!odd) {
				return xb ? "ptr[rip - 0x13456+1-3]" : "[rip - 0x13456+1-3]";
			}
			return xb ? "ptr[rdx+r15+0x12]" : "[rdx+r15+0x12]";
		}
	case MEM8:
		return "byte [eax+edx]";
	case MEM16:
		return "word [esi]";
	case MEM32:
		return "dword [eax+64]";
	case MEM64:
		return "qword [rax+64]";
	case MEM_ONLY_DISP:
		return xb ? "ptr[(void*)0x123]" : "[0x123]";
	case _REG16: // not ax
		{
			static const char reg16[][4] = {
				"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"
			};
			// Index range 1..7 skips entry 0 (the accumulator).
			return reg16[1 + (slot % 7)];
		}
	case _REG8: // not al
		{
			static const char reg8[][4] = {
#ifdef XBYAK64 // QQQ
				"al", "cl", "dl", "bl", "al", "cl", "dl", "bl"
#else
				"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"
#endif
			};
			return reg8[1 + (slot % 7)];
		}
	case _REG32: // not eax
		{
			static const char reg32[][4] = {
				"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"
			};
			return reg32[1 + (slot % 7)];
		}
#ifdef XBYAK64
	case _REG64: // not rax
		{
			static const char reg64[][4] = {
				"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"
			};
			return reg64[1 + (slot % 7)];
		}
	case _REG64_2:
		{
			static const char reg64Ext[][4] = {
				"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
			};
			return reg64Ext[slot];
		}
	case REG32_2:
		{
			static const char reg32Ext[][5] = {
				"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d"
			};
			return reg32Ext[slot];
		}
	case REG16_2:
		{
			static const char reg16Ext[][5] = {
				"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w"
			};
			return reg16Ext[slot];
		}
	case REG8_2:
		{
			static const char reg8Ext[][5] = {
				"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b"
			};
			return reg8Ext[slot];
		}
	case REG8_3:
		{
			static const char reg8Low[][5] = {
				"spl", "bpl", "sil", "dil", "spl", "bpl", "sil", "dil"
			};
			return reg8Low[slot];
		}
#endif
	case EAX:
		return "eax";
	case AX:
		return "ax";
	case AL:
		return "al";
	case K_K:
		return xb ? "k5 | k3" : "k5{k3}";
	case IMM32:
		return xb ? "12345678" : "dword 12345678";
	case IMM8:
		return xb ? "4" : "byte 4";
	case IMM_1:
		return "4";
	case IMM_2:
		// Same text for both output flavours.
		return "0xda";
	case VM32X_32:
		return xb ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]";
	case VM32X_64:
		return xb ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]";
	case VM32Y_32:
		return xb ? "ptr [ymm4]" : "[ymm4]";
	case VM32Y_64:
		return xb ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]";
	case VM32X_K:
		return xb ? "ptr [64+xmm13*2+r13] | k6" : "[64+xmm13*2+r13]{k6}";
	case VM32Y_K:
		return xb ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}";
	case VM32Z_K:
		return xb ? "ptr [64+zmm13*2+r13] | k6" : "[64+zmm13*2+r13]{k6}";
	case VM32Z:
		return xb ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
	case M_1to2:
		return xb ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
	case M_1to4:
		return xb ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
	case M_1to8:
		return xb ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
	case M_1to16:
		return xb ? "ptr_b [eax+32]" : "[eax+32]{1to16}";

	case M_xword:
		return xb ? "ptr [eax+32]" : "oword [eax+32]";
	case M_yword:
		// Same text for both output flavours.
		return "yword [eax+32]";
	case MY_1to4:
		return xb ? "yword_b [eax+32]" : "[eax+32]{1to4}";
	case K:
		{
			static const char maskReg[][5] = {
				"k1", "k2", "k3", "k4", "k5", "k6", "k7",
			};
			// Only seven entries, so the slot is reduced modulo 7.
			return maskReg[slot % 7];
		}
	case K2:
		return xb ? "k3 | k5" : "k3{k5}";
#ifdef XBYAK64
	case XMM_SAE:
		return xb ? "xmm25 | T_sae" : "xmm25, {sae}";
	case YMM_SAE:
		return xb ? "ymm25 | T_sae" : "ymm25, {sae}";
	case ZMM_SAE:
		return xb ? "zmm25 | T_sae" : "zmm25, {sae}";
	case XMM_ER:
		return xb ? "xmm4 | T_rd_sae" : "xmm4, {rd-sae}";
	case YMM_ER:
		return xb ? "ymm20 | T_rd_sae" : "ymm20, {rd-sae}";
	case ZMM_ER:
		return xb ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}";
	case XMM_KZ:
		return xb ? "xmm5 | k5" : "xmm5{k5}";
	case YMM_KZ:
		return xb ? "ymm2 |k3|T_z" : "ymm2{k3}{z}";
	case ZMM_KZ:
		return xb ? "zmm7|k1" : "zmm7{k1}";
	case MEM_K:
		return xb ? "ptr [rax] | k1" : "[rax]{k1}";
#else
	case XMM_SAE:
		return xb ? "xmm5 | T_sae" : "xmm5, {sae}";
	case YMM_SAE:
		return xb ? "ymm5 | T_sae" : "ymm5, {sae}";
	case ZMM_SAE:
		return xb ? "zmm5 | T_sae" : "zmm5, {sae}";
	case XMM_ER:
		return xb ? "xmm30 | T_rd_sae" : "xmm30, {rd-sae}";
	case YMM_ER:
		return xb ? "ymm2 | T_rd_sae" : "ymm2, {rd-sae}";
	case ZMM_ER:
		return xb ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
	case MEM_K:
		return xb ? "ptr [eax] | k1" : "[eax]{k1}";
#endif
	case XMM_K:
		return xb ? "xmm5 | k7" : "xmm5{k7}";
	case YMM_K:
		return xb ? "ymm5 | k4" : "ymm5{k4}";
	case ZMM_K:
		return xb ? "zmm5 | k3" : "zmm5{k3}";
	}
	// Unknown type: no text available.
	return 0;
}
|
|
public:
|
|
Test(bool isXbyak)
|
|
: isXbyak_(isXbyak)
|
|
, funcNum_(1)
|
|
{
|
|
if (!isXbyak_) return;
|
|
printf("%s",
|
|
" void gen0()\n"
|
|
" {\n");
|
|
}
|
|
/*
	gcc and vc fail to compile the generated source when it is too large,
	so the output is split into several functions.
*/
|
|
void separateFunc()
|
|
{
|
|
if (!isXbyak_) return;
|
|
printf(
|
|
" }\n"
|
|
" void gen%d()\n"
|
|
" {\n", funcNum_++);
|
|
}
|
|
// Xbyak flavour only: closes the last generated function and prints gen(),
// which calls gen0() .. gen<funcNum_-1>() in order.
~Test()
{
	if (!isXbyak_) return;
	fputs(
		" }\n"
		" void gen()\n"
		" {\n", stdout);
	int n = 0;
	while (n < funcNum_) {
		printf(
			" gen%d();\n", n);
		++n;
	}
	fputs(
		" }\n", stdout);
}
|
|
void put()
|
|
{
|
|
putAVX512();
|
|
}
|
|
void putOpmask()
|
|
{
|
|
{
|
|
const char *tbl[] = {
|
|
"kadd",
|
|
"kand",
|
|
"kandn",
|
|
"kor",
|
|
"kxnor",
|
|
"kxor",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
std::string name = tbl[i];
|
|
put(name + "b", K, K, K);
|
|
put(name + "w", K, K, K);
|
|
put(name + "q", K, K, K);
|
|
put(name + "d", K, K, K);
|
|
}
|
|
put("kunpckbw", K, K, K);
|
|
put("kunpckwd", K, K, K);
|
|
put("kunpckdq", K, K, K);
|
|
}
|
|
{
|
|
const char *tbl[] = {
|
|
"knot",
|
|
"kortest",
|
|
"ktest",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
std::string name = tbl[i];
|
|
put(name + "b", K, K);
|
|
put(name + "w", K, K);
|
|
put(name + "q", K, K);
|
|
put(name + "d", K, K);
|
|
}
|
|
}
|
|
{
|
|
const char *tbl[] = {
|
|
"kshiftl",
|
|
"kshiftr",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
std::string name = tbl[i];
|
|
put(name + "b", K, K, IMM8);
|
|
put(name + "w", K, K, IMM8);
|
|
put(name + "q", K, K, IMM8);
|
|
put(name + "d", K, K, IMM8);
|
|
}
|
|
}
|
|
put("kmovw", K, K | MEM | REG32);
|
|
put("kmovq", K, K | MEM);
|
|
put("kmovb", K, K | MEM | REG32);
|
|
put("kmovd", K, K | MEM | REG32);
|
|
|
|
put("kmovw", MEM | REG32, K);
|
|
put("kmovq", MEM, K);
|
|
put("kmovb", MEM | REG32, K);
|
|
put("kmovd", MEM | REG32, K);
|
|
#ifdef XBYAK64
|
|
put("kmovq", K, REG64);
|
|
put("kmovq", REG64, K);
|
|
#endif
|
|
}
|
|
void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0)
|
|
{
|
|
std::string modifier;
|
|
char pk[16] = "";
|
|
const char *pz = "";
|
|
const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" };
|
|
const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" };
|
|
if (isXbyak_) {
|
|
if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx);
|
|
if (z) pz = "|T_z";
|
|
printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]);
|
|
} else {
|
|
if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx);
|
|
if (z) pz = "{z}";
|
|
printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]);
|
|
}
|
|
}
|
|
void putCombi()
|
|
{
|
|
const char *xTbl[] = {
|
|
"xmm2",
|
|
#ifdef XBYAK64
|
|
"xmm8", "xmm31"
|
|
#else
|
|
"xmm5", "xmm6"
|
|
#endif
|
|
};
|
|
const char *yTbl[] = {
|
|
"ymm0",
|
|
#ifdef XBYAK64
|
|
"ymm15", "ymm31"
|
|
#else
|
|
"ymm4", "ymm2"
|
|
#endif
|
|
};
|
|
const char *zTbl[] = {
|
|
"zmm1",
|
|
#ifdef XBYAK64
|
|
"zmm9", "zmm30"
|
|
#else
|
|
"zmm3", "zmm7"
|
|
#endif
|
|
};
|
|
const size_t N = NUM_OF_ARRAY(zTbl);
|
|
for (size_t i = 0; i < N; i++) {
|
|
for (size_t j = 0; j < N; j++) {
|
|
separateFunc();
|
|
for (size_t k = 0; k < N; k++) {
|
|
#ifdef XBYAK64
|
|
for (int kIdx = 0; kIdx < 8; kIdx++) {
|
|
for (int z = 0; z < 2; z++) {
|
|
put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx, z == 1);
|
|
put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx, z == 1);
|
|
for (int sae = 0; sae < 5; sae++) {
|
|
put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae);
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
put_vaddpd(xTbl[i], xTbl[j], xTbl[k]);
|
|
put_vaddpd(yTbl[i], yTbl[j], yTbl[k]);
|
|
for (int sae = 0; sae < 5; sae++) {
|
|
put_vaddpd(zTbl[i], zTbl[j], zTbl[k], sae);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
put("vaddpd", XMM, XMM, _MEM);
|
|
put("vaddpd", YMM, YMM, _MEM);
|
|
put("vaddpd", ZMM, ZMM, _MEM);
|
|
}
|
|
void putCmpK()
|
|
{
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
bool supportYMM;
|
|
} tbl[] = {
|
|
{ "vcmppd", true },
|
|
{ "vcmpps", true },
|
|
{ "vcmpsd", false },
|
|
{ "vcmpss", false },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl *p = &tbl[i];
|
|
put(p->name, K, _XMM, _XMM | MEM, IMM8);
|
|
if (!p->supportYMM) continue;
|
|
put(p->name, K, _YMM, _YMM | MEM, IMM8);
|
|
put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
|
|
}
|
|
}
|
|
put("vcmppd", K2, ZMM, ZMM_SAE, IMM);
|
|
#ifdef XBYAK64
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
} tbl[] = {
|
|
{ "vcomisd" },
|
|
{ "vcomiss" },
|
|
{ "vucomisd" },
|
|
{ "vucomiss" },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl *p = &tbl[i];
|
|
put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM);
|
|
}
|
|
}
|
|
put("vcomiss", _XMM3, XMM | MEM);
|
|
put("vcomiss", XMM, XMM_SAE);
|
|
#endif
|
|
}
|
|
void putBroadcastSub(int idx, int disp)
|
|
{
|
|
#ifdef XBYAK64
|
|
const char *a = "rax";
|
|
#else
|
|
const char *a = "eax";
|
|
#endif
|
|
if (isXbyak_) {
|
|
printf("vaddpd(zmm%d, zmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
|
|
printf("vaddpd(ymm%d, ymm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
|
|
printf("vaddpd(xmm%d, xmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
|
|
} else {
|
|
printf("vaddpd zmm%d, zmm1, [%s+%d]{1to8}\n", idx, a, disp);
|
|
printf("vaddpd ymm%d, ymm1, [%s+%d]{1to4}\n", idx, a, disp);
|
|
printf("vaddpd xmm%d, xmm1, [%s+%d]{1to2}\n", idx, a, disp);
|
|
}
|
|
}
|
|
void putBroadcast()
|
|
{
|
|
for (int i = 0; i < 9; i++) {
|
|
putBroadcastSub(0, i);
|
|
#ifdef XBYAK64
|
|
putBroadcastSub(10, i);
|
|
putBroadcastSub(20, i);
|
|
#endif
|
|
}
|
|
put("vpbroadcastb", XMM_KZ | ZMM_KZ, REG8 | _MEM);
|
|
put("vpbroadcastw", XMM_KZ | ZMM_KZ, REG16 | _MEM);
|
|
put("vpbroadcastd", XMM_KZ | ZMM_KZ, REG32 | _MEM);
|
|
#ifdef XBYAK64
|
|
put("vpbroadcastq", XMM_KZ | ZMM_KZ, REG64 | _MEM);
|
|
#endif
|
|
{
|
|
const char *tbl[] = {
|
|
"vpbroadcastb",
|
|
"vpbroadcastw",
|
|
"vpbroadcastd",
|
|
"vpbroadcastq",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM);
|
|
}
|
|
}
|
|
put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM);
|
|
put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
|
|
put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
|
|
put("vbroadcasti32x8", ZMM_KZ, _MEM);
|
|
put("vbroadcasti64x4", ZMM_KZ, _MEM);
|
|
}
|
|
void putMisc1()
|
|
{
|
|
put("vmaskmovps", XMM, XMM, MEM);
|
|
put("vmaskmovps", YMM, YMM, MEM);
|
|
|
|
put("vmaskmovpd", YMM, YMM, MEM);
|
|
put("vmaskmovpd", XMM, XMM, MEM);
|
|
|
|
put("vmaskmovps", MEM, XMM, XMM);
|
|
put("vmaskmovpd", MEM, XMM, XMM);
|
|
|
|
put("vbroadcastf128", YMM, MEM);
|
|
put("vbroadcasti128", YMM, MEM);
|
|
put("vbroadcastsd", YMM|_YMM3, XMM|MEM);
|
|
put("vbroadcastsd", ZMM, XMM|MEM);
|
|
{
|
|
const char *tbl[] = {
|
|
"vbroadcastss",
|
|
"vpbroadcastb",
|
|
"vpbroadcastw",
|
|
"vpbroadcastd",
|
|
"vpbroadcastq",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
put(tbl[i], XMM | YMM | ZMM, XMM|MEM);
|
|
}
|
|
}
|
|
|
|
put("vinsertf128", YMM, YMM, XMM | MEM, IMM8);
|
|
put("vinserti128", YMM, YMM, XMM | MEM, IMM8);
|
|
put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
|
|
put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
|
|
|
|
{
|
|
const char *tbl[] = {
|
|
"vpmaskmovd", "vpmaskmovq"
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, XMM, XMM, MEM);
|
|
put(name, YMM, YMM, MEM);
|
|
put(name, MEM, XMM, XMM);
|
|
put(name, MEM, YMM, YMM);
|
|
}
|
|
}
|
|
{
|
|
const char *tbl[] = {
|
|
"vpermd", "vpermps",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, YMM, YMM, YMM | MEM);
|
|
}
|
|
}
|
|
{
|
|
const char *tbl[] = {
|
|
"vpermq", "vpermpd",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, YMM, YMM | MEM, IMM8);
|
|
}
|
|
}
|
|
put("vpextrw", REG32e | MEM, XMM, IMM); // nasm is ok, yasm generate redundant code
|
|
}
|
|
void putAVX512_M_X()
|
|
{
|
|
const char *tbl[] = {
|
|
"vmovapd",
|
|
"vmovaps",
|
|
"vmovupd",
|
|
"vmovups",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, MEM, ZMM);
|
|
put(name, ZMM, MEM);
|
|
#ifdef XBYAK64
|
|
put(name, MEM, _XMM3);
|
|
put(name, _XMM3, MEM);
|
|
#endif
|
|
}
|
|
}
|
|
// Emits vmov* family forms (plus valignd/valignq) that use the xmm16..xmm23 /
// ymm16..ymm23 register groups. Produces nothing unless XBYAK64 is defined.
void put_vmov()
{
#ifdef XBYAK64
	put("vmovd", _XMM3, MEM|REG32);
	put("vmovd", MEM|REG32, _XMM3);
	put("vmovq", _XMM3, MEM|REG64|XMM);
	put("vmovq", MEM|REG64|XMM, _XMM3);
	put("vmovhlps", _XMM3, _XMM3, _XMM3);
	put("vmovlhps", _XMM3, _XMM3, _XMM3);
	put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM);
	{
		// Non-temporal stores share the same operand masks.
		const char *const ntStores[] = {
			"vmovntdq",
			"vmovntpd",
			"vmovntps",
		};
		for (size_t n = 0; n < NUM_OF_ARRAY(ntStores); n++) {
			put(ntStores[n], MEM, _XMM3 | _YMM3 | ZMM);
		}
	}

	put("vmovsd", XMM_KZ, _XMM3, _XMM3);
	put("vmovsd", XMM_KZ, MEM);
	put("vmovsd", MEM_K, XMM);
	put("vmovss", XMM_KZ, _XMM3, _XMM3);
	put("vmovss", XMM_KZ, MEM);
	put("vmovss", MEM_K, XMM);

	put("vmovshdup", _ZMM, _ZMM);
	put("vmovsldup", _ZMM, _ZMM);

	{
		const char *const names[] = {
			"valignd",
			"valignq",
		};
		for (size_t n = 0; n < NUM_OF_ARRAY(names); n++) {
			const char *const insn = names[n];
			put(insn, XMM_KZ, _XMM, _XMM | MEM, IMM);
			put(insn, _YMM3, _YMM3, _YMM3 | _MEM, IMM);
			put(insn, _ZMM, _ZMM, _ZMM | _MEM, IMM);
		}
	}
	{
		const char names[][16] = {
			"vmovhpd",
			"vmovhps",
			"vmovlpd",
			"vmovlps",
		};
		for (size_t n = 0; n < NUM_OF_ARRAY(names); n++) {
			put(names[n], _XMM3, _XMM3, MEM);
			put(names[n], MEM, _XMM3);
		}
	}
#endif
}
|
|
void put512_X_XM()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
bool M_X;
|
|
} tbl[] = {
|
|
{ "vmovddup", false },
|
|
{ "vmovdqa32", true },
|
|
{ "vmovdqa64", true },
|
|
{ "vmovdqu8", true },
|
|
{ "vmovdqu16", true },
|
|
{ "vmovdqu32", true },
|
|
{ "vmovdqu64", true },
|
|
{ "vpabsb", false },
|
|
{ "vpabsw", false },
|
|
{ "vpabsd", false },
|
|
{ "vpabsq", false },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
put(p.name, _XMM|XMM_KZ, _XMM|MEM);
|
|
put(p.name, _YMM|YMM_KZ, _YMM|MEM);
|
|
put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
|
|
if (!p.M_X) continue;
|
|
put(p.name, MEM, _XMM);
|
|
put(p.name, MEM, _YMM);
|
|
put(p.name, MEM, _ZMM);
|
|
}
|
|
put("vsqrtpd", XMM_KZ, M_1to2 | _MEM);
|
|
put("vsqrtpd", YMM_KZ, M_1to4 | _MEM);
|
|
put("vsqrtpd", ZMM_KZ, M_1to8 | _MEM);
|
|
put("vsqrtpd", ZMM_KZ, ZMM_ER);
|
|
|
|
put("vsqrtps", XMM_KZ, M_1to4 | _MEM);
|
|
put("vsqrtps", YMM_KZ, M_1to8 | _MEM);
|
|
put("vsqrtps", ZMM_KZ, M_1to16 | _MEM);
|
|
put("vsqrtps", ZMM_KZ, ZMM_ER);
|
|
|
|
put("vpabsd", ZMM_KZ, M_1to16 | _MEM);
|
|
put("vpabsq", ZMM_KZ, M_1to8 | _MEM);
|
|
|
|
put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM);
|
|
put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
|
|
|
|
put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
|
|
put("vbroadcastf64x4", ZMM_KZ, _MEM);
|
|
put("vbroadcastf32x8", ZMM_KZ, _MEM);
|
|
}
|
|
void put512_X_X_XM()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
uint64_t mem;
|
|
} tbl[] = {
|
|
{ "vsqrtsd", MEM },
|
|
{ "vsqrtss", MEM },
|
|
{ "vunpckhpd", M_1to2 },
|
|
{ "vunpckhps", M_1to4 },
|
|
{ "vunpcklpd", M_1to2 },
|
|
{ "vunpcklps", M_1to4 },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
put(p.name, XMM_KZ, _XMM, _XMM|p.mem);
|
|
}
|
|
}
|
|
// Emits three-operand forms from a table of (name, operand 1, operand 2,
// operand 3) rows; rows are processed in table order. For the shift, shuffle,
// extract and immediate-permute rows the third operand is IMM8.
// Produces nothing unless XBYAK64 is defined.
void put512_X3()
{
#ifdef XBYAK64
	const struct Row {
		const char *name;
		uint64_t op1;
		uint64_t op2;
		uint64_t op3;
	} rows[] = {
		// pack
		{ "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
		{ "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
		{ "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },

		{ "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
		{ "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },

		{ "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
		{ "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },

		// add
		{ "vpaddb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpaddusb", XMM_KZ, _XMM, _XMM | MEM },
		{ "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },

		{ "vpaddusw", XMM_KZ, _XMM, _XMM | MEM },
		{ "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },

		// subtract
		{ "vpsubb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpsubw", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpsubusb", XMM_KZ, _XMM, _XMM | MEM },
		{ "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },

		{ "vpsubusw", XMM_KZ, _XMM, _XMM | MEM },
		{ "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },

		// and / andn
		{ "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
		{ "vpandq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },

		{ "vpandnd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
		{ "vpandnq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },

		// average
		{ "vpavgb", ZMM_KZ, _ZMM, _ZMM },
		{ "vpavgw", ZMM_KZ, _ZMM, _ZMM },

		// compare into opmask
		{ "vpcmpeqb", K2, _ZMM, _ZMM | _MEM },
		{ "vpcmpeqw", K2, _ZMM, _ZMM | _MEM },
		{ "vpcmpeqd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
		{ "vpcmpeqq", K2, _ZMM, _ZMM | M_1to8 | _MEM },

		{ "vpcmpgtb", K2, _ZMM, _ZMM | _MEM },
		{ "vpcmpgtw", K2, _ZMM, _ZMM | _MEM },
		{ "vpcmpgtd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
		{ "vpcmpgtq", K2, _ZMM, _ZMM | M_1to8 | _MEM },

		// multiply-add
		{ "vpmaddubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpmaddwd", ZMM_KZ, _ZMM, _ZMM | _MEM },

		// max / min
		{ "vpmaxsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpmaxsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpmaxsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
		{ "vpmaxsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },

		{ "vpmaxub", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpmaxuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpmaxud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
		{ "vpmaxuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },

		{ "vpminsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpminsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpminsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
		{ "vpminsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },

		{ "vpminub", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpminuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
		{ "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
		{ "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },

		// shifts by immediate
		{ "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 },
		{ "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
		{ "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },

		{ "vpsrldq", _XMM3, _XMM3 | _MEM, IMM8 },
		{ "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
		{ "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },

		{ "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 },
		{ "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },

		{ "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
		{ "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },

		{ "vpsraq", XMM, XMM, IMM8 },
		{ "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 },
		{ "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },

		{ "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 },
		{ "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 },
		{ "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },

		{ "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 },
		{ "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },

		{ "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
		{ "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },

		{ "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
		{ "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },

		// variable shifts
		{ "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsravw", _ZMM, _ZMM, _MEM },

		{ "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },

		{ "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },

		{ "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsllvw", _ZMM, _ZMM, _MEM },

		{ "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },

		{ "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },

		{ "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsrlvw", _ZMM, _ZMM, _MEM },

		{ "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },

		{ "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
		{ "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },

		// shuffles
		{ "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpshufb", ZMM_KZ, _ZMM, _MEM },

		{ "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
		{ "vpshufhw", ZMM_KZ, _MEM, IMM8 },

		{ "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
		{ "vpshuflw", ZMM_KZ, _MEM, IMM8 },

		{ "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
		{ "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },

		// or / xor
		{ "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },

		{ "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
		{ "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
		{ "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },

		{ "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
		{ "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },

		// sum of absolute differences / multiply
		{ "vpsadbw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpsadbw", _ZMM, _ZMM, _MEM },

		{ "vpmuldq", _XMM3, _XMM, _XMM | M_1to2 | _MEM },
		{ "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },

		{ "vpmulhuw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpmulhuw", ZMM_KZ, _ZMM, _MEM },

		{ "vpmulhw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpmulhw", ZMM_KZ, _ZMM, _MEM },

		{ "vpmullw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpmullw", ZMM_KZ, _ZMM, _MEM },

		{ "vpmulld", _XMM3, _XMM, M_1to4 | _MEM },
		{ "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },

		{ "vpmullq", _XMM3, _XMM, M_1to2 | _MEM },
		{ "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM },
		{ "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		// unpack
		{ "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpunpckhbw", _ZMM, _ZMM, _MEM },

		{ "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM },
		{ "vpunpckhwd", _ZMM, _ZMM, _MEM },

		{ "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM },
		{ "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },

		{ "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM },
		{ "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },

		{ "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM },
		{ "vpunpcklbw", _ZMM, _ZMM, _MEM },

		{ "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM },
		{ "vpunpcklwd", _ZMM, _ZMM, _MEM },

		{ "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM },
		{ "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },

		{ "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM },
		{ "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },

		// extract
		{ "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
		{ "vextractf64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
		{ "vextractf32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
		{ "vextractf64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },

		{ "vextracti32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
		{ "vextracti64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
		{ "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
		{ "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },

		{ "vextractps", REG32 | _MEM, _XMM3, IMM8 },

		// permute
		{ "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpermw", XMM_KZ, _XMM, _XMM | _MEM },
		{ "vpermw", ZMM_KZ, _ZMM, _ZMM | _MEM },

		{ "vpermd", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
		{ "vpermd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },

		{ "vpermilpd", XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
		{ "vpermilpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
		{ "vpermilpd", XMM_KZ, M_1to2 | _MEM, IMM8 },
		{ "vpermilpd", ZMM_KZ, M_1to8 | _MEM, IMM8 },

		{ "vpermilps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4 },
		{ "vpermilps", ZMM_KZ, _ZMM, _MEM | M_1to16 },
		{ "vpermilps", XMM_KZ, _MEM | M_1to4 | _MEM, IMM8 },
		{ "vpermilps", ZMM_KZ, _MEM | M_1to16 | _MEM, IMM8 },

		{ "vpermpd", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
		{ "vpermpd", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
		{ "vpermpd", YMM_KZ, _YMM, M_1to4 | _MEM },
		{ "vpermpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },

		{ "vpermps", YMM_KZ, _YMM, M_1to8 | _MEM },
		{ "vpermps", ZMM_KZ, _ZMM, M_1to16 | _MEM },

		{ "vpermq", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
		{ "vpermq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
		{ "vpermq", YMM_KZ, _YMM, M_1to4 | _MEM },
		{ "vpermq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
	};
	const Row *const end = rows + NUM_OF_ARRAY(rows);
	for (const Row *r = rows; r != end; ++r) {
		put(r->name, r->op1, r->op2, r->op3);
	}
#endif
}
|
|
void put512_X3_I()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
uint64_t x1;
|
|
uint64_t x2;
|
|
uint64_t xm;
|
|
} tbl[] = {
|
|
#ifdef XBYAK64
|
|
{ "vinsertps", _XMM, _XMM, _XMM3 | _MEM },
|
|
|
|
{ "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
|
|
{ "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
|
|
|
|
{ "vshufps", XMM_KZ, _XMM, M_1to4 | _MEM },
|
|
{ "vshufps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
|
|
|
|
{ "vinsertf32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
|
|
{ "vinsertf32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
|
|
|
|
{ "vinsertf64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
|
|
{ "vinsertf64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
|
|
|
|
{ "vinsertf32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
|
|
{ "vinsertf64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
|
|
|
|
{ "vinserti32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
|
|
{ "vinserti32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
|
|
|
|
{ "vinserti64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
|
|
{ "vinserti64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
|
|
|
|
{ "vinserti32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
|
|
{ "vinserti64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
|
|
#endif
|
|
{ "vpalignr", ZMM_KZ, _ZMM, _ZMM | _MEM },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
put(p.name, p.x1, p.x2, p.xm, IMM8);
|
|
}
|
|
#ifdef XBYAK64
|
|
put("vpextrb", _REG64 | _MEM, _XMM3, IMM8);
|
|
put("vpextrw", _REG64 | _MEM, _XMM3, IMM8);
|
|
put("vpextrd", _REG32 | _MEM, _XMM3, IMM8);
|
|
put("vpextrq", _REG64 | _MEM, _XMM3, IMM8);
|
|
put("vpinsrb", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
|
|
put("vpinsrw", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
|
|
put("vpinsrd", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
|
|
put("vpinsrq", _XMM3, _XMM3, _REG64 | _MEM, IMM8);
|
|
#endif
|
|
}
|
|
void put512_FMA()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
bool supportYMM;
|
|
} tbl[] = {
|
|
{ "vfmadd", true },
|
|
{ "vfmadd", false },
|
|
{ "vfmaddsub", true },
|
|
{ "vfmsubadd", true },
|
|
{ "vfmsub", true },
|
|
{ "vfmsub", false },
|
|
{ "vfnmadd", true },
|
|
{ "vfnmadd", false },
|
|
{ "vfnmsub", true },
|
|
{ "vfnmsub", false },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
const struct Ord {
|
|
const char *name;
|
|
} ord[] = {
|
|
{ "132" },
|
|
{ "213" },
|
|
{ "231" },
|
|
};
|
|
for (size_t j = 0; j < NUM_OF_ARRAY(ord); j++) {
|
|
const char sufTbl[][2][8] = {
|
|
{ "pd", "ps" },
|
|
{ "sd", "ss" },
|
|
};
|
|
for (size_t k = 0; k < 2; k++) {
|
|
const std::string suf = sufTbl[p.supportYMM ? 0 : 1][k];
|
|
uint64_t mem = 0;
|
|
if (suf == "pd") {
|
|
mem = M_1to2;
|
|
} else if (suf == "ps") {
|
|
mem = M_1to4;
|
|
} else {
|
|
mem = XMM_ER;
|
|
}
|
|
std::string name = std::string(p.name) + ord[j].name + suf;
|
|
const char *q = name.c_str();
|
|
put(q, XMM_KZ, _XMM, mem | _MEM);
|
|
if (!p.supportYMM) continue;
|
|
if (suf == "pd") {
|
|
mem = M_1to8;
|
|
} else if (suf == "ps") {
|
|
mem = M_1to16;
|
|
} else {
|
|
mem = XMM_ER;
|
|
}
|
|
put(q, _ZMM, _ZMM, mem | _MEM);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
void put512_Y_XM()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
bool all_xmm; // 2nd param
|
|
} tbl[] = {
|
|
{ "vpmovsxbw", false },
|
|
{ "vpmovsxbd", true },
|
|
{ "vpmovsxbq", true },
|
|
{ "vpmovsxwd", false },
|
|
{ "vpmovsxwq", true },
|
|
{ "vpmovsxdq", false },
|
|
|
|
{ "vpmovzxbw", false },
|
|
{ "vpmovzxbd", true },
|
|
{ "vpmovzxbq", true },
|
|
{ "vpmovzxwd", false },
|
|
{ "vpmovzxwq", true },
|
|
{ "vpmovzxdq", false },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
const char *name = p.name;
|
|
put(name, XMM_KZ | YMM, _XMM | _MEM);
|
|
if (p.all_xmm) {
|
|
put(name, ZMM, _XMM | _MEM);
|
|
} else {
|
|
put(name, ZMM, YMM | _MEM);
|
|
}
|
|
}
|
|
}
|
|
// Emits the EVEX forms of the basic arithmetic / logic mnemonics (XBYAK64
// builds only). Every stem is combined with the pd/ps suffixes, and with
// sd/ss as well unless the stem is marked packed-only. All suffixes get an
// XMM form; pd/ps additionally get a ZMM form with the wider broadcast flag.
void put512_AVX1()
{
#ifdef XBYAK64
	const struct Stem {
		const char *text;
		bool packedOnly; // true: no sd/ss variants
	} stems[] = {
		{ "vadd", false },
		{ "vsub", false },
		{ "vmul", false },
		{ "vdiv", false },
		{ "vmax", false },
		{ "vmin", false },
		{ "vand", true },
		{ "vandn", true },
		{ "vor", true },
		{ "vxor", true },
	};
	const struct Form {
		const char *suffix;
		bool hasZmmForm;
		uint64_t xmmBcst; // OR-ed with _MEM for the XMM form
		uint64_t zmmBcst; // OR-ed with _MEM for the ZMM form
	} forms[] = {
		{ "pd", true, M_1to2, M_1to8 },
		{ "ps", true, M_1to4, M_1to16 },
		{ "sd", false, 0, 0 },
		{ "ss", false, 0, 0 },
	};
	for (size_t s = 0; s < NUM_OF_ARRAY(stems); s++) {
		// Packed-only stems stop after the first two (pd, ps) forms.
		const size_t formCount = stems[s].packedOnly ? 2 : NUM_OF_ARRAY(forms);
		for (size_t f = 0; f < formCount; f++) {
			const std::string mnemonic = std::string(stems[s].text) + forms[f].suffix;
			put(mnemonic.c_str(), _XMM3 | XMM_KZ, _XMM, forms[f].xmmBcst | _MEM);
			if (forms[f].hasZmmForm) {
				put(mnemonic.c_str(), _ZMM, _ZMM, forms[f].zmmBcst | _MEM);
			}
		}
	}
#endif
}
|
|
void putAVX1()
|
|
{
|
|
const struct Tbl {
|
|
const char *name;
|
|
bool only_pd_ps;
|
|
} tbl[] = {
|
|
{ "add", false },
|
|
{ "sub", false },
|
|
{ "mul", false },
|
|
{ "div", false },
|
|
{ "max", false },
|
|
{ "min", false },
|
|
{ "and", true },
|
|
{ "andn", true },
|
|
{ "or", true },
|
|
{ "xor", true },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const struct Suf {
|
|
const char *suf;
|
|
bool supportYMM;
|
|
} suf[] = {
|
|
{ "pd", true },
|
|
{ "ps", true },
|
|
{ "sd", false },
|
|
{ "ss", false },
|
|
};
|
|
for (size_t j = 0; j < NUM_OF_ARRAY(suf); j++) {
|
|
if (tbl[i].only_pd_ps && j == 2) break;
|
|
std::string name = std::string("v") + tbl[i].name + suf[j].suf;
|
|
const char *p = name.c_str();
|
|
put(p, XMM, XMM | MEM);
|
|
put(p, XMM, XMM, XMM | MEM);
|
|
if (!suf[j].supportYMM) continue;
|
|
put(p, YMM, YMM | MEM);
|
|
put(p, YMM, YMM, YMM | MEM);
|
|
put(p, ZMM, ZMM, ZMM | MEM);
|
|
}
|
|
}
|
|
}
|
|
// Emits the AVX-512 conversion (vcvt*) patterns; generated only when XBYAK64
// is defined. Each call hands put() a mnemonic followed by one operand-class
// bitmask per operand.
//
// Fix: the group between vcvtss2usi and vcvttpd2qq used to be a second copy
// of the vcvtpd2dq group (already emitted near the top), so the truncating
// vcvttpd2dq was never generated. It now emits vcvttpd2dq, with ZMM_SAE on the
// 512-bit form like the other truncating converts in this function.
void put512_cvt()
{
#ifdef XBYAK64
	put("vcvtdq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtdq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtdq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);

	put("vcvtdq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtdq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
	put("vcvtdq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);

	put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvtpd2dq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtpd2ps", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvtpd2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvtpd2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvtpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvtpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvtpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvtpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtph2ps", XMM_KZ, _XMM | _MEM);
	put("vcvtph2ps", YMM_KZ, _XMM | _MEM);
	put("vcvtph2ps", ZMM_KZ, _YMM | _MEM | YMM_SAE);

	put("vcvtps2ph", XMM_KZ | _MEM, _XMM, IMM8);
	put("vcvtps2ph", XMM_KZ | _MEM, _YMM, IMM8);
	put("vcvtps2ph", YMM_KZ | _MEM, _ZMM, IMM8);
	put("vcvtps2ph", YMM_KZ, ZMM_SAE, IMM8);

	put("vcvtps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
	put("vcvtps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);

	put("vcvtps2udq", XMM_KZ, _XMM | M_1to4);
	put("vcvtps2udq", YMM_KZ, _YMM | M_1to8);
	put("vcvtps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);

	put("vcvtps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);

	put("vcvtps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);

	put("vcvtps2pd", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtps2pd", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtps2pd", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);

	put("vcvtqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvtqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);

	put("vcvtsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);

	put("vcvtsd2ss", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_ER);

	put("vcvtsi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
	put("vcvtsi2sd", XMM, XMM_ER, REG64);

	put("vcvtsi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
	put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64);

	put("vcvtss2sd", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_SAE);

	put("vcvtss2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);

	put("vcvtss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);

	// Was a duplicate of the vcvtpd2dq group above (with ZMM_ER).
	put("vcvttpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvttpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvttpd2dq", YMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);

	put("vcvttpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvttpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvttpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);

	put("vcvttpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvttpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvttpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_SAE);

	put("vcvttpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvttpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvttpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);

	put("vcvttps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvttps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
	put("vcvttps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);

	put("vcvttps2udq", XMM_KZ, _XMM | M_1to4);
	put("vcvttps2udq", YMM_KZ, _YMM | M_1to8);
	put("vcvttps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);

	put("vcvttps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvttps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvttps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);

	put("vcvttps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);

	put("vcvttsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);

	put("vcvttsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);

	put("vcvttss2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);

	put("vcvttss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);

	put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtudq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);

	put("vcvtudq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
	put("vcvtudq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
	put("vcvtudq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);

	put("vcvtuqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
	put("vcvtuqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
	put("vcvtuqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtuqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
	put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
	put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vcvtusi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
	put("vcvtusi2sd", XMM, XMM_ER, REG64);

	put("vcvtusi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
	put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
#endif
}
|
|
// Operand-shape selectors used by putGather() / putScatter(). Each name is
// three letter pairs, one pair per emitted form, giving the vector class
// (x / y / z) of the first and second operand of that form.
enum {
	xx_yy_zz = 0, // (x, x), (y, y), (z, z)
	xx_yx_zy = 1, // (x, x), (y, x), (z, y)
	xx_xy_yz = 2  // (x, x), (x, y), (y, z)
};
|
|
// Emits three operand combinations per gather mnemonic (XBYAK64 builds only).
// The shape selector stored with each mnemonic picks which masked destination
// class is paired with which vector-index memory class.
void putGather()
{
#ifdef XBYAK64
	struct Form {
		uint64_t op1;
		uint64_t op2;
	};
	const Form sameWidth[3] = { { XMM_K, VM32X }, { YMM_K, VM32Y }, { ZMM_K, VM32Z } };
	const Form narrowIndex[3] = { { XMM_K, VM32X }, { YMM_K, VM32X }, { ZMM_K, VM32Y } };
	const Form narrowData[3] = { { XMM_K, VM32X }, { XMM_K, VM32Y }, { YMM_K, VM32Z } };

	const struct Entry {
		const char *mnemonic;
		int shape;
	} entries[] = {
		{ "vpgatherdd", xx_yy_zz },
		{ "vpgatherdq", xx_yx_zy },
		{ "vpgatherqd", xx_xy_yz },
		{ "vpgatherqq", xx_yy_zz },
		{ "vgatherdps", xx_yy_zz },
		{ "vgatherdpd", xx_yx_zy },
		{ "vgatherqps", xx_xy_yz },
		{ "vgatherqpd", xx_yy_zz },
	};
	for (size_t i = 0; i < NUM_OF_ARRAY(entries); i++) {
		const Form *forms = 0;
		if (entries[i].shape == xx_yy_zz) {
			forms = sameWidth;
		} else if (entries[i].shape == xx_yx_zy) {
			forms = narrowIndex;
		} else if (entries[i].shape == xx_xy_yz) {
			forms = narrowData;
		}
		if (forms == 0) continue; // unknown selector: emit nothing
		for (int n = 0; n < 3; n++) {
			put(entries[i].mnemonic, forms[n].op1, forms[n].op2);
		}
	}
#endif
}
|
|
// Emits three operand combinations per scatter mnemonic (XBYAK64 builds
// only). The shape selector stored with each mnemonic picks which masked
// vector-index memory class is paired with which source register class.
void putScatter()
{
#ifdef XBYAK64
	struct Form {
		uint64_t op1;
		uint64_t op2;
	};
	const Form shapeXxYyZz[3] = { { VM32X_K, _XMM }, { VM32Y_K, _YMM }, { VM32Z_K, _ZMM } };
	const Form shapeXxYxZy[3] = { { VM32X_K, _XMM }, { VM32Y_K, _XMM }, { VM32Z_K, _YMM } };
	const Form shapeXxXyYz[3] = { { VM32X_K, _XMM }, { VM32X_K, _YMM }, { VM32Y_K, _ZMM } };

	const struct Entry {
		const char *mnemonic;
		int shape;
	} entries[] = {
		{ "vpscatterdd", xx_yy_zz },
		{ "vpscatterdq", xx_xy_yz },
		{ "vpscatterqd", xx_yx_zy },
		{ "vpscatterqq", xx_yy_zz },

		{ "vscatterdps", xx_yy_zz },
		{ "vscatterdpd", xx_xy_yz },
		{ "vscatterqps", xx_yx_zy },
		{ "vscatterqpd", xx_yy_zz },
	};
	for (size_t i = 0; i < NUM_OF_ARRAY(entries); i++) {
		const Form *forms = 0;
		if (entries[i].shape == xx_yy_zz) {
			forms = shapeXxYyZz;
		} else if (entries[i].shape == xx_yx_zy) {
			forms = shapeXxYxZy;
		} else if (entries[i].shape == xx_xy_yz) {
			forms = shapeXxXyYz;
		}
		if (forms == 0) continue; // unknown selector: emit nothing
		for (int n = 0; n < 3; n++) {
			put(entries[i].mnemonic, forms[n].op1, forms[n].op2);
		}
	}
#endif
}
|
|
void putBlend()
|
|
{
|
|
put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
|
|
put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
|
|
put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
|
|
|
|
put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
|
|
put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
|
|
put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
|
|
|
|
put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
|
|
put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
|
|
put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
|
|
|
|
put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
|
|
put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
|
|
put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
|
|
|
|
put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
|
|
put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
|
|
put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
|
|
|
|
put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
|
|
put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
|
|
put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
|
|
}
|
|
void putVpcmp()
|
|
{
|
|
const uint64_t b0Tbl[] = { 0, 0, 0 };
|
|
const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
|
|
const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
|
|
const struct Tbl {
|
|
const char *name;
|
|
uint64_t b;
|
|
} tbl[] = {
|
|
{ "vpcmpb", 0 },
|
|
{ "vpcmpub", 0 },
|
|
{ "vpcmpw", 0 },
|
|
{ "vpcmpuw", 0 },
|
|
{ "vpcmpd", M_1to4 },
|
|
{ "vpcmpud", M_1to4 },
|
|
{ "vpcmpq", M_1to2 },
|
|
{ "vpcmpuq", M_1to2 },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
|
|
put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8);
|
|
put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8);
|
|
put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8);
|
|
}
|
|
}
|
|
void putVtest()
|
|
{
|
|
const uint64_t b0Tbl[] = { 0, 0, 0 };
|
|
const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
|
|
const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
|
|
const struct Tbl {
|
|
const char *name;
|
|
uint64_t b;
|
|
} tbl[] = {
|
|
{ "vptestmb", 0 },
|
|
{ "vptestmw", 0 },
|
|
{ "vptestmd", M_1to4 },
|
|
{ "vptestmq", M_1to2 },
|
|
|
|
{ "vptestnmb", 0 },
|
|
{ "vptestnmw", 0 },
|
|
{ "vptestnmd", M_1to4 },
|
|
{ "vptestnmq", M_1to2 },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
|
|
put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0]);
|
|
put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1]);
|
|
put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2]);
|
|
}
|
|
}
|
|
void putCompExp()
|
|
{
|
|
{
|
|
const char *tbl[] = {
|
|
"vcompresspd",
|
|
"vcompressps",
|
|
"vpcompressd",
|
|
"vpcompressq",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, XMM_KZ | _MEM, _XMM);
|
|
put(name, YMM_KZ | _MEM, _YMM);
|
|
put(name, ZMM_KZ | _MEM, _ZMM);
|
|
}
|
|
}
|
|
{
|
|
const char *tbl[] = {
|
|
"vexpandpd",
|
|
"vexpandps",
|
|
"vpexpandd",
|
|
"vpexpandq",
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const char *name = tbl[i];
|
|
put(name, XMM_KZ, _XMM | _MEM);
|
|
put(name, YMM_KZ, _YMM | _MEM);
|
|
put(name, ZMM_KZ, _ZMM | _MEM);
|
|
}
|
|
}
|
|
}
|
|
void putPerm()
|
|
{
|
|
const uint64_t b0Tbl[] = { 0, 0, 0 };
|
|
const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
|
|
const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
|
|
const struct Tbl {
|
|
const char *name;
|
|
uint64_t b;
|
|
} tbl[] = {
|
|
{ "vpermt2b", 0 },
|
|
{ "vpermt2w", 0 },
|
|
{ "vpermt2d", M_1to4 },
|
|
{ "vpermt2q", M_1to2 },
|
|
{ "vpermt2ps", M_1to4 },
|
|
{ "vpermt2pd", M_1to2 },
|
|
|
|
{ "vpermi2b", 0 },
|
|
{ "vpermi2w", 0 },
|
|
{ "vpermi2d", M_1to4 },
|
|
{ "vpermi2q", M_1to2 },
|
|
{ "vpermi2ps", M_1to4 },
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
const Tbl& p = tbl[i];
|
|
const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
|
|
put(p.name, XMM_KZ, _XMM, _XMM | _MEM | bTbl[0]);
|
|
put(p.name, YMM_KZ, _YMM, _YMM | _MEM | bTbl[1]);
|
|
put(p.name, ZMM_KZ, _ZMM, _ZMM | _MEM | bTbl[2]);
|
|
}
|
|
}
|
|
void putShuff()
|
|
{
|
|
put("vshuff32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
|
|
put("vshuff32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
|
|
|
|
put("vshuff64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
|
|
put("vshuff64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
|
|
|
|
put("vshufi32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
|
|
put("vshufi32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
|
|
|
|
put("vshufi64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
|
|
put("vshufi64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
|
|
}
|
|
void putMov()
|
|
{
|
|
put("vpmovm2b", _XMM | _YMM | _ZMM, K);
|
|
put("vpmovm2w", _XMM | _YMM | _ZMM, K);
|
|
put("vpmovm2d", _XMM | _YMM | _ZMM, K);
|
|
put("vpmovm2q", _XMM | _YMM | _ZMM, K);
|
|
|
|
put("vpmovb2m", K, _XMM | _YMM | _ZMM);
|
|
put("vpmovw2m", K, _XMM | _YMM | _ZMM);
|
|
put("vpmovd2m", K, _XMM | _YMM | _ZMM);
|
|
put("vpmovq2m", K, _XMM | _YMM | _ZMM);
|
|
|
|
put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
|
|
put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
|
|
put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovqd", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovsqd", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovusqd", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
|
|
|
|
put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovdw", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovsdw", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovusdw", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovwb", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovswb", YMM_KZ | _MEM, _ZMM);
|
|
|
|
put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM);
|
|
put("vpmovuswb", YMM_KZ | _MEM, _ZMM);
|
|
}
|
|
void putRot()
|
|
{
|
|
put("vprolvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
|
|
put("vprolvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
|
|
put("vprolvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
|
|
|
|
put("vprolvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
|
|
put("vprolvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
|
|
put("vprolvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
|
|
|
|
put("vprorvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
|
|
put("vprorvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
|
|
put("vprorvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
|
|
|
|
put("vprorvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
|
|
put("vprorvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
|
|
put("vprorvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
|
|
|
|
put("vprold", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
|
|
put("vprold", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
|
|
put("vprold", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
|
|
|
|
put("vprolq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
|
|
put("vprolq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
|
|
put("vprolq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
|
|
|
|
put("vprord", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
|
|
put("vprord", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
|
|
put("vprord", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
|
|
|
|
put("vprorq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
|
|
put("vprorq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
|
|
put("vprorq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
|
|
}
|
|
// Emits a mixed bag of AVX-512 patterns (ternary logic, getexp/getmant,
// fixupimm, rcp14/rsqrt14, rndscale, scalef, dbpsadbw, multishift, conflict,
// lzcnt, mask broadcast, exp2/rcp28/rsqrt28, gather/scatter prefetch, range,
// reduce, madd52). Generated only when XBYAK64 is defined. The operand
// classes that repeat across many calls are named once below; the calls are
// issued in the same order as before.
void putMisc2()
{
#ifdef XBYAK64
	// vector | memory | broadcast, for two-per-XMM ("Q") and four-per-XMM ("D") element counts
	const uint64_t xmmQ = _XMM | _MEM | M_1to2;
	const uint64_t ymmQ = _YMM | _MEM | M_1to4;
	const uint64_t zmmQ = _ZMM | _MEM | M_1to8;
	const uint64_t xmmD = _XMM | _MEM | M_1to4;
	const uint64_t ymmD = _YMM | _MEM | M_1to8;
	const uint64_t zmmD = _ZMM | _MEM | M_1to16;
	// the ZMM classes with ZMM_SAE added
	const uint64_t zmmQSae = _ZMM | _MEM | M_1to8 | ZMM_SAE;
	const uint64_t zmmDSae = _ZMM | _MEM | M_1to16 | ZMM_SAE;
	// XMM-or-memory operand without / with XMM_SAE
	const uint64_t xmmM = _XMM | _MEM;
	const uint64_t xmmSae = _XMM | _MEM | XMM_SAE;
	const uint64_t anyVec = _XMM | _YMM | _ZMM;

	put("vpternlogd", XMM_KZ, _XMM, xmmD, IMM8);
	put("vpternlogd", YMM_KZ, _YMM, ymmD, IMM8);
	put("vpternlogd", ZMM_KZ, _ZMM, zmmD, IMM8);

	put("vpternlogq", XMM_KZ, _XMM, xmmQ, IMM8);
	put("vpternlogq", YMM_KZ, _YMM, ymmQ, IMM8);
	put("vpternlogq", ZMM_KZ, _ZMM, zmmQ, IMM8);

	// vgetexppd/ps use MEM (not _MEM), so their operands are spelled out.
	put("vgetexppd", XMM_KZ, _XMM | MEM | M_1to2);
	put("vgetexppd", YMM_KZ, _YMM | MEM | M_1to4);
	put("vgetexppd", ZMM_KZ, _ZMM | MEM | M_1to8 | ZMM_SAE);

	put("vgetexpps", XMM_KZ, _XMM | MEM | M_1to4);
	put("vgetexpps", YMM_KZ, _YMM | MEM | M_1to8);
	put("vgetexpps", ZMM_KZ, _ZMM | MEM | M_1to16 | ZMM_SAE);

	put("vgetexpsd", XMM_KZ, _XMM, xmmSae);
	put("vgetexpss", XMM_KZ, _XMM, xmmSae);

	put("vgetmantpd", XMM_KZ, xmmQ, IMM8);
	put("vgetmantpd", YMM_KZ, ymmQ, IMM8);
	put("vgetmantpd", ZMM_KZ, zmmQ, IMM8);

	put("vgetmantps", XMM_KZ, xmmD, IMM8);
	put("vgetmantps", YMM_KZ, ymmD, IMM8);
	put("vgetmantps", ZMM_KZ, zmmD, IMM8);

	put("vgetmantsd", XMM_KZ, _XMM, xmmSae, IMM8);
	put("vgetmantss", XMM_KZ, _XMM, xmmSae, IMM8);

	put("vfixupimmpd", XMM_KZ, _XMM, xmmQ, IMM8);
	put("vfixupimmpd", YMM_KZ, _YMM, ymmQ, IMM8);
	put("vfixupimmpd", ZMM_KZ, _ZMM, zmmQ, IMM8);

	put("vfixupimmps", XMM_KZ, _XMM, xmmD, IMM8);
	put("vfixupimmps", YMM_KZ, _YMM, ymmD, IMM8);
	put("vfixupimmps", ZMM_KZ, _ZMM, zmmD, IMM8);

	put("vfixupimmsd", XMM_KZ, _XMM, xmmM, IMM8);
	put("vfixupimmss", XMM_KZ, _XMM, xmmM, IMM8);

	put("vrcp14pd", XMM_KZ, xmmQ);
	put("vrcp14pd", YMM_KZ, ymmQ);
	put("vrcp14pd", ZMM_KZ, zmmQ);

	put("vrcp14ps", XMM_KZ, xmmD);
	put("vrcp14ps", YMM_KZ, ymmD);
	put("vrcp14ps", ZMM_KZ, zmmD);

	put("vrcp14sd", XMM_KZ, _XMM, xmmM);

	put("vrcp14ss", XMM_KZ, _XMM, xmmM);

	put("vrsqrt14pd", XMM_KZ, xmmQ);
	put("vrsqrt14pd", YMM_KZ, ymmQ);
	put("vrsqrt14pd", ZMM_KZ, zmmQ);

	put("vrsqrt14ps", XMM_KZ, xmmD);
	put("vrsqrt14ps", YMM_KZ, ymmD);
	put("vrsqrt14ps", ZMM_KZ, zmmD);

	put("vrsqrt14sd", XMM_KZ, _XMM, xmmM);

	put("vrsqrt14ss", XMM_KZ, _XMM, xmmM);

	put("vrndscalepd", XMM_KZ, xmmQ, IMM8);
	put("vrndscalepd", YMM_KZ, ymmQ, IMM8);
	put("vrndscalepd", ZMM_KZ, zmmQ, IMM8);

	put("vrndscaleps", XMM_KZ, xmmD, IMM8);
	put("vrndscaleps", YMM_KZ, ymmD, IMM8);
	put("vrndscaleps", ZMM_KZ, zmmD, IMM8);

	put("vrndscalesd", XMM_KZ, _XMM, xmmM, IMM8);

	put("vrndscaless", XMM_KZ, _XMM, xmmM, IMM8);

	put("vscalefpd", XMM_KZ, _XMM, xmmQ);
	put("vscalefpd", YMM_KZ, _YMM, ymmQ);
	put("vscalefpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_ER);

	put("vscalefps", XMM_KZ, _XMM, xmmD);
	put("vscalefps", YMM_KZ, _YMM, ymmD);
	put("vscalefps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_ER);

	put("vscalefsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);
	put("vscalefss", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);

	put("vdbpsadbw", XMM_KZ, _XMM, xmmM, IMM8);
	put("vdbpsadbw", YMM_KZ, _YMM, _YMM | _MEM, IMM8);
	put("vdbpsadbw", ZMM_KZ, _ZMM, _ZMM | _MEM, IMM8);

	put("vpmultishiftqb", XMM_KZ, _XMM, xmmQ);
	put("vpmultishiftqb", YMM_KZ, _YMM, ymmQ);
	put("vpmultishiftqb", ZMM_KZ, _ZMM, zmmQ);

	put("vpconflictd", XMM_KZ, xmmD);
	put("vpconflictd", YMM_KZ, ymmD);
	put("vpconflictd", ZMM_KZ, zmmD);

	put("vpconflictq", XMM_KZ, xmmQ);
	put("vpconflictq", YMM_KZ, ymmQ);
	put("vpconflictq", ZMM_KZ, zmmQ);

	put("vplzcntd", XMM_KZ, xmmD);
	put("vplzcntd", YMM_KZ, ymmD);
	put("vplzcntd", ZMM_KZ, zmmD);

	put("vplzcntq", XMM_KZ, xmmQ);
	put("vplzcntq", YMM_KZ, ymmQ);
	put("vplzcntq", ZMM_KZ, zmmQ);

	put("vpbroadcastmb2q", anyVec, K);
	put("vpbroadcastmw2d", anyVec, K);

	put("vexp2pd", ZMM_KZ, zmmQSae);
	put("vexp2ps", ZMM_KZ, zmmDSae);

	put("vrcp28pd", ZMM_KZ, zmmQSae);
	put("vrcp28ps", ZMM_KZ, zmmDSae);
	put("vrcp28sd", XMM_KZ, _XMM, xmmSae);
	put("vrcp28ss", XMM_KZ, _XMM, xmmSae);

	put("vrsqrt28pd", ZMM_KZ, zmmQSae);
	put("vrsqrt28ps", ZMM_KZ, zmmDSae);
	put("vrsqrt28sd", XMM_KZ, _XMM, xmmSae);
	put("vrsqrt28ss", XMM_KZ, _XMM, xmmSae);

	put("vgatherpf0dps", VM32Z_K);
	put("vgatherpf0qps", VM32Z_K);
	put("vgatherpf0dpd", VM32Y_K);
	put("vgatherpf0qpd", VM32Z_K);

	put("vgatherpf1dps", VM32Z_K);
	put("vgatherpf1qps", VM32Z_K);
	put("vgatherpf1dpd", VM32Y_K);
	put("vgatherpf1qpd", VM32Z_K);

	put("vscatterpf0dps", VM32Z_K);
	put("vscatterpf0qps", VM32Z_K);
	put("vscatterpf0dpd", VM32Y_K);
	put("vscatterpf0qpd", VM32Z_K);

	put("vscatterpf1dps", VM32Z_K);
	put("vscatterpf1qps", VM32Z_K);
	put("vscatterpf1dpd", VM32Y_K);
	put("vscatterpf1qpd", VM32Z_K);

	put("vrangepd", XMM_KZ, _XMM, xmmQ, IMM8);
	put("vrangepd", YMM_KZ, _YMM, ymmQ, IMM8);
	put("vrangepd", ZMM_KZ, _ZMM, zmmQSae, IMM8);

	put("vrangeps", XMM_KZ, _XMM, xmmD, IMM8);
	put("vrangeps", YMM_KZ, _YMM, ymmD, IMM8);
	put("vrangeps", ZMM_KZ, _ZMM, zmmDSae, IMM8);

	put("vrangesd", XMM_KZ, _XMM, xmmSae, IMM8);
	put("vrangess", XMM_KZ, _XMM, xmmSae, IMM8);

	put("vreducepd", XMM_KZ, xmmQ, IMM8);
	put("vreducepd", YMM_KZ, ymmQ, IMM8);
	put("vreducepd", ZMM_KZ, zmmQSae, IMM8);

	put("vreduceps", XMM_KZ, xmmD, IMM8);
	put("vreduceps", YMM_KZ, ymmD, IMM8);
	put("vreduceps", ZMM_KZ, zmmDSae, IMM8);

	put("vreducesd", XMM_KZ, _XMM, xmmSae, IMM8);
	put("vreducess", XMM_KZ, _XMM, xmmSae, IMM8);

	put("vpmadd52luq", XMM_KZ, _XMM, xmmQ);
	put("vpmadd52luq", YMM_KZ, _YMM, ymmQ);
	put("vpmadd52luq", ZMM_KZ, _ZMM, zmmQ);

	put("vpmadd52huq", XMM_KZ, _XMM, xmmQ);
	put("vpmadd52huq", YMM_KZ, _YMM, ymmQ);
	put("vpmadd52huq", ZMM_KZ, _ZMM, zmmQ);
#endif
}
|
|
void classSubMem(const char *nm, char x, bool broadcast, int size)
|
|
{
|
|
printf("%s ", nm);
|
|
if (isXbyak_) {
|
|
printf("(k5|k3, %cword%s [rax+64], 5);dump();\n", x, broadcast ? "_b" : "");
|
|
} else {
|
|
if (broadcast) {
|
|
int d = x == 'x' ? 128 / size : x == 'y' ? 256 / size : 512 / size;
|
|
printf("k5{k3}, [rax+64]{1to%d}, 5\n", d);
|
|
} else {
|
|
if (x == 'x') x = 'o'; // nasm
|
|
printf("k5{k3}, %cword [rax+64], 5\n", x);
|
|
}
|
|
}
|
|
}
|
|
void putClassSub(const char *name, int size)
|
|
{
|
|
put(name, K_K, _XMM | _YMM | _ZMM, IMM8);
|
|
for (int i = 0; i < 2; i++) {
|
|
classSubMem(name, 'x', i == 0, size);
|
|
classSubMem(name, 'y', i == 0, size);
|
|
classSubMem(name, 'z', i == 0, size);
|
|
}
|
|
}
|
|
// Emits the vfpclass patterns (XBYAK64 builds only): the packed forms via
// putClassSub() with their element size in bits, then the scalar forms with
// an XMM-or-memory source.
void putClass()
{
#ifdef XBYAK64
	putClassSub("vfpclasspd", 64);
	putClassSub("vfpclassps", 32);

	const char *const scalarNames[] = { "vfpclasssd", "vfpclassss" };
	for (size_t i = 0; i < NUM_OF_ARRAY(scalarNames); i++) {
		put(scalarNames[i], K_K, _XMM | _MEM, IMM8);
	}
#endif
}
|
|
// Minimal pattern set, used by putAVX512() instead of the full list when
// MIN_TEST is defined. Emits a single vextractf32x4 pattern, and only in
// XBYAK64 builds.
void putMin()
{
#ifdef XBYAK64
	const char *const mnemonic = "vextractf32x4";
	put(mnemonic, XMM_KZ, _YMM, IMM8);
#endif
}
|
|
void putDisp8N()
|
|
{
|
|
{
|
|
const int tbl[] = {
|
|
-129, -128, -127, 0, 1, 64, 65, 127, 128
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
char xs[128], ns[128];
|
|
int v = tbl[i];
|
|
CYBOZU_SNPRINTF(xs, sizeof(xs), "xmm0, ptr[eax%+d]", v);
|
|
CYBOZU_SNPRINTF(ns, sizeof(ns), "xmm0, [eax%+d]", v);
|
|
put("vpbroadcastb", xs, ns);
|
|
}
|
|
}
|
|
{
|
|
const int tbl[] = {
|
|
-1024, -512 -256, -128, -64, -32, -16, -8, -4, -2, -1,
|
|
0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512
|
|
};
|
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
char xs[128], ns[128];
|
|
int v = tbl[i];
|
|
CYBOZU_SNPRINTF(xs, sizeof(xs), "zmm0, zmm1, ptr_b[eax%+d]", v);
|
|
CYBOZU_SNPRINTF(ns, sizeof(ns), "zmm0, zmm1, [eax%+d]{1to16}", v);
|
|
put("vaddps", xs, ns);
|
|
}
|
|
}
|
|
#ifdef XBYAK64
|
|
put("vfmadd231ps", "zmm8, zmm31, ptr_b[r14+rbp-0x1e4]", "zmm8, zmm31, [r14+rbp-0x1e4]{1to16}");
|
|
#endif
|
|
}
|
|
void putAVX512()
|
|
{
|
|
#ifdef MIN_TEST
|
|
putMin();
|
|
#else
|
|
putOpmask();
|
|
separateFunc();
|
|
putCombi();
|
|
separateFunc();
|
|
putCmpK();
|
|
separateFunc();
|
|
putBroadcast();
|
|
separateFunc();
|
|
putAVX512_M_X();
|
|
separateFunc();
|
|
put_vmov();
|
|
separateFunc();
|
|
put512_X_XM();
|
|
separateFunc();
|
|
put512_X_X_XM();
|
|
separateFunc();
|
|
put512_X3();
|
|
separateFunc();
|
|
put512_X3_I();
|
|
separateFunc();
|
|
put512_FMA();
|
|
separateFunc();
|
|
put512_Y_XM();
|
|
separateFunc();
|
|
put512_AVX1();
|
|
separateFunc();
|
|
put512_cvt();
|
|
separateFunc();
|
|
putMisc1();
|
|
separateFunc();
|
|
putGather();
|
|
separateFunc();
|
|
putBlend();
|
|
separateFunc();
|
|
putVpcmp();
|
|
separateFunc();
|
|
putVtest();
|
|
separateFunc();
|
|
putCompExp();
|
|
separateFunc();
|
|
putPerm();
|
|
separateFunc();
|
|
putShuff();
|
|
separateFunc();
|
|
putMisc2();
|
|
separateFunc();
|
|
putMov();
|
|
separateFunc();
|
|
putRot();
|
|
separateFunc();
|
|
putScatter();
|
|
separateFunc();
|
|
putClass();
|
|
putDisp8N();
|
|
#endif
|
|
}
|
|
};
|
|
|
|
int main(int argc, char *[])
|
|
{
|
|
Test test(argc > 1);
|
|
test.put();
|
|
}
|