213fe7a452
Merge commit 'fdf626b74f35deedce0e6196c36b8c9f846c038a'
445 lines
13 KiB
Markdown
445 lines
13 KiB
Markdown
# Usage
|
|
|
|
Inherit from the `Xbyak::CodeGenerator` class and emit instructions from a member function (the constructor in this example).
|
|
```
|
|
#include <xbyak/xbyak.h>
|
|
|
|
struct Code : Xbyak::CodeGenerator {
|
|
Code(int x)
|
|
{
|
|
mov(eax, x);
|
|
ret();
|
|
}
|
|
};
|
|
```
|
|
Or you can pass the instance of CodeGenerator without inheriting.
|
|
```
|
|
void genCode(Xbyak::CodeGenerator& code, int x) {
|
|
using namespace Xbyak::util;
|
|
code.mov(eax, x);
|
|
code.ret();
|
|
}
|
|
```
|
|
|
|
Make an instance of the class and get the function
|
|
pointer by calling `getCode()` and call it.
|
|
```
|
|
Code c(5);
|
|
int (*f)() = c.getCode<int (*)()>();
|
|
printf("ret=%d\n", f()); // ret = 5
|
|
```
|
|
|
|
## Syntax
|
|
Similar to MASM/NASM syntax with parentheses.
|
|
|
|
```
|
|
NASM Xbyak
|
|
mov eax, ebx --> mov(eax, ebx);
|
|
inc ecx --> inc(ecx);
|
|
ret --> ret();
|
|
```
|
|
|
|
## Addressing
|
|
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
|
|
otherwise use `ptr`.
|
|
|
|
```
|
|
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
|
[rip + 32bit disp] ; x64 only
|
|
|
|
NASM Xbyak
|
|
mov eax, [ebx+ecx] --> mov(eax, ptr [ebx+ecx]);
|
|
mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
|
|
test byte [esp], 4 --> test(byte [esp], 4);
|
|
inc qword [rax] --> inc(qword [rax]);
|
|
```
|
|
**Note**: `qword`, ... are member variables, so don't use names such as `dword` as an unsigned int type name.
|
|
|
|
### How to use Selector (Segment Register)
|
|
```
|
|
mov eax, [fs:eax] --> putSeg(fs);
|
|
mov(eax, ptr [eax]);
|
|
mov ax, cs --> mov(ax, cs);
|
|
```
|
|
**Note**: Segment class is not derived from `Operand`.
|
|
|
|
## AVX
|
|
|
|
```
|
|
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
|
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
|
|
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
|
```
|
|
|
|
**Note**:
|
|
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use the two-operand version for backward compatibility.
|
|
But the newer version will not support it.
|
|
```
|
|
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
|
```
|
|
|
|
## AVX-512
|
|
|
|
```
|
|
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
|
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
|
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
|
vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2);
|
|
vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2);
|
|
vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae);
|
|
vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary.
|
|
vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5);
|
|
|
|
vaddpd xmm1, xmm2, [rax+256] --> vaddpd(xmm1, xmm2, ptr [rax+256]);
|
|
vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]);
|
|
vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]);
|
|
vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]);
|
|
vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]);
|
|
vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4);
|
|
|
|
vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, xword [eax+33]); // use xword for m128 instead of oword
|
|
vcvtpd2dq(xmm16, ptr [eax+33]); // default xword
|
|
vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]);
|
|
vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256
|
|
vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast
|
|
|
|
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
|
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
|
vfpclasspd k5{k3}, [rax+64]{1to4}, 5 --> vfpclasspd(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
|
|
|
|
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
|
|
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
|
|
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
|
|
setDefaultEncoding(VexEncoding); // default encoding is VEX
|
|
vpdpbusd(xm0, xm1, xm2); // VEX encoding
|
|
```
|
|
|
|
- setDefaultEncoding(PreferredEncoding encoding);
|
|
- Set the default encoding to select EVEX or VEX.
|
|
- The default value is EvexEncoding.
|
|
- This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd.
|
|
|
|
### Remark
|
|
* `k1`, ..., `k7` are opmask registers.
|
|
- `k0` is treated as no mask.
|
|
- e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are the same as `vmovaps(zmm0, ptr[rax]);`.
|
|
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
|
|
* `k4 | k3` is different from `k3 | k4`.
|
|
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
|
|
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
|
|
|
|
## APX
|
|
[Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html)
|
|
- Support 16 additional 64-bit GPRs (general-purpose registers) r16, ..., r31
|
|
- 32-bit regs are r16d, ..., r31d
|
|
- 16-bit regs are r16w, ..., r31w
|
|
- 8-bit regs are r16b, ..., r31b
|
|
- `add(r20, r21);`
|
|
- `lea(r30, ptr[r29+r31]);`
|
|
- Support three-operand instruction
|
|
- `add(r20, r21, r23);`
|
|
- `add(r20, ptr[rax + rcx * 8 + 0x1234], r23);`
|
|
- Support T_nf for NF=1 (status flags update suppression)
|
|
- `add(r20|T_nf, r21, r23);` // Set EVEX.NF=1
|
|
- Support T_zu for ND=ZU (zero upper) for imul and setcc
|
|
- `imul(ax|T_zu, cx, 0x1234);` // Set ND=ZU
|
|
- `imul(ax|T_zu|T_nf, cx, 0x1234);` // Set ND=ZU and EVEX.NF=1
|
|
- `setb(r31b|T_zu);` // same as setb(r31b); movzx(r31, r31b);
|
|
- See [sample/zero_upper.cpp](../sample/zero_upper.cpp)
|
|
|
|
### ccmpSCC and ctestSCC
|
|
|
|
- ccmpSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? cmp(op1, op2) : dfv
|
|
- ctestSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? test(op1, op2) : dfv
|
|
- SCC means source condition code such as z, a, gt.
|
|
- See [sample/ccmp.cpp](../sample/ccmp.cpp)
|
|
- Specify the union of T_of(=8), T_sf(=4), T_zf(=2), or T_cf(=1) for dfv.
|
|
|
|
|
|
## Label
|
|
Two kinds of labels are supported: string literals and the `Label` class.
|
|
|
|
### String literal
|
|
```
|
|
L("L1");
|
|
jmp("L1");
|
|
|
|
jmp("L2");
|
|
...
|
|
a few mnemonics (8-bit displacement jmp)
|
|
...
|
|
L("L2");
|
|
|
|
jmp("L3", T_NEAR);
|
|
...
|
|
a lot of mnemonics (32-bit displacement jmp)
|
|
...
|
|
L("L3");
|
|
```
|
|
|
|
* Call `hasUndefinedLabel()` to verify your code has no undefined label.
|
|
* You can use a label as the immediate value of mov, e.g. `mov(eax, "L2")`.
|
|
|
|
### Support `@@`, `@f`, `@b` like MASM
|
|
|
|
```
|
|
L("@@"); // <A>
|
|
jmp("@b"); // jmp to <A>
|
|
jmp("@f"); // jmp to <B>
|
|
L("@@"); // <B>
|
|
jmp("@b"); // jmp to <B>
|
|
mov(eax, "@b");
|
|
jmp(eax); // jmp to <B>
|
|
```
|
|
|
|
### Local label
|
|
|
|
Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabel()`
|
|
are treated as a local label.
|
|
`inLocalLabel()` and `outLocalLabel()` can be nested.
|
|
|
|
```
|
|
void func1()
|
|
{
|
|
inLocalLabel();
|
|
L(".lp"); // <A> ; local label
|
|
...
|
|
jmp(".lp"); // jmp to <A>
|
|
L("aaa"); // global label <C>
|
|
outLocalLabel();
|
|
|
|
inLocalLabel();
|
|
L(".lp"); // <B> ; local label
|
|
func1();
|
|
jmp(".lp"); // jmp to <B>
|
|
outLocalLabel();
|
|
jmp("aaa"); // jmp to <C>
|
|
}
|
|
```
|
|
|
|
### short and long jump
|
|
Xbyak treats a jump to a not-yet-defined label as a short jump if no type is specified.
|
|
So if the distance between the jmp and the label is larger than 127 bytes, Xbyak will raise an error.
|
|
|
|
```
|
|
jmp("short-jmp"); // short jmp
|
|
// small code
|
|
L("short-jmp");
|
|
|
|
jmp("long-jmp");
|
|
// long code
|
|
L("long-jmp"); // throw exception
|
|
```
|
|
Then specify T_NEAR for jmp.
|
|
```
|
|
jmp("long-jmp", T_NEAR); // long jmp
|
|
// long code
|
|
L("long-jmp");
|
|
```
|
|
Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR.
|
|
```
|
|
jmp("long-jmp"); // long jmp
|
|
// long code
|
|
L("long-jmp");
|
|
```
|
|
|
|
### Label class
|
|
|
|
`L()` and `jxx()` support Label class.
|
|
|
|
```
|
|
Xbyak::Label label1, label2;
|
|
L(label1);
|
|
...
|
|
jmp(label1);
|
|
...
|
|
jmp(label2);
|
|
...
|
|
L(label2);
|
|
```
|
|
|
|
Use `putL` for jmp table
|
|
```
|
|
Label labelTbl, L0, L1, L2;
|
|
mov(rax, labelTbl);
|
|
// rdx is an index of jump table
|
|
jmp(ptr [rax + rdx * sizeof(void*)]);
|
|
L(labelTbl);
|
|
putL(L0);
|
|
putL(L1);
|
|
putL(L2);
|
|
L(L0);
|
|
....
|
|
L(L1);
|
|
....
|
|
```
|
|
|
|
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
|
|
|
|
```
|
|
Label label2;
|
|
Label label1 = L(); // make label1 ; same as Label label1; L(label1);
|
|
...
|
|
jmp(label2); // label2 is not determined here
|
|
...
|
|
assignL(label2, label1); // label2 <- label1
|
|
```
|
|
The `jmp` in the above code jumps to label1 assigned by `assignL`.
|
|
|
|
**Note**:
|
|
* srcLabel must be used in `L()`.
|
|
* dstLabel must not be used in `L()`.
|
|
|
|
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
|
|
```
|
|
// not AutoGrow mode
|
|
Label label;
|
|
assert(label.getAddress() == 0);
|
|
L(label);
|
|
assert(label.getAddress() == getCurr());
|
|
```
|
|
|
|
### RIP-relative addressing
|
|
```
|
|
Label label;
|
|
mov(eax, ptr [rip + label]); // eax = 4
|
|
...
|
|
|
|
L(label);
|
|
dd(4);
|
|
```
|
|
```
|
|
int x;
|
|
...
|
|
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
|
|
```
|
|
|
|
## Far jump
|
|
|
|
Use `word|dword|qword` instead of `ptr` to specify the address size.
|
|
|
|
### 32 bit mode
|
|
```
|
|
jmp(word[eax], T_FAR); // jmp m16:16(FF /5)
|
|
jmp(dword[eax], T_FAR); // jmp m16:32(FF /5)
|
|
```
|
|
|
|
### 64 bit mode
|
|
```
|
|
jmp(word[rax], T_FAR); // jmp m16:16(FF /5)
|
|
jmp(dword[rax], T_FAR); // jmp m16:32(FF /5)
|
|
jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5)
|
|
```
|
|
The same applies to `call`.
|
|
|
|
## Code size
|
|
The default max code size is 4096 bytes.
|
|
Specify the size in constructor of `CodeGenerator()` if necessary.
|
|
|
|
```
|
|
class Quantize : public Xbyak::CodeGenerator {
|
|
public:
|
|
Quantize()
|
|
: CodeGenerator(8192)
|
|
{
|
|
}
|
|
...
|
|
};
|
|
```
|
|
|
|
## User allocated memory
|
|
|
|
You can make jit code on prepared memory.
|
|
|
|
Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
|
|
|
|
```
|
|
alignas(4096) uint8_t buf[8192]; // C++11 or later
|
|
|
|
struct Code : Xbyak::CodeGenerator {
|
|
Code() : Xbyak::CodeGenerator(sizeof(buf), buf)
|
|
{
|
|
mov(rax, 123);
|
|
ret();
|
|
}
|
|
};
|
|
|
|
int main()
|
|
{
|
|
Code c;
|
|
c.setProtectModeRE(); // set memory to Read/Exec
|
|
printf("%d\n", c.getCode<int(*)()>()());
|
|
}
|
|
```
|
|
|
|
**Note**: See [../sample/test0.cpp](../sample/test0.cpp).
|
|
|
|
### AutoGrow
|
|
|
|
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
|
|
|
|
Call `ready()` or `readyRE()` before calling `getCode()` to fix up the jump addresses.
|
|
```
|
|
struct Code : Xbyak::CodeGenerator {
|
|
Code()
|
|
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
|
{
|
|
...
|
|
}
|
|
};
|
|
Code c;
|
|
// generate code for jit
|
|
c.ready(); // mode = Read/Write/Exec
|
|
```
|
|
|
|
**Note**:
|
|
* Don't use the address returned by `getCurr()` before calling `ready()` because it may be an invalid address.
|
|
|
|
### Read/Exec mode
|
|
Xbyak sets the memory to Read/Write/Exec mode in order to run jit code.
|
|
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
|
|
call `setProtectModeRE()` after generating jit code.
|
|
|
|
```
|
|
struct Code : Xbyak::CodeGenerator {
|
|
Code()
|
|
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
|
|
{
|
|
mov(eax, 123);
|
|
ret();
|
|
}
|
|
};
|
|
|
|
Code c;
|
|
c.setProtectModeRE();
|
|
...
|
|
|
|
```
|
|
Call `readyRE()` instead of `ready()` when using `AutoGrow` mode.
|
|
See [protect-re.cpp](../sample/protect-re.cpp).
|
|
|
|
## Exception-less mode
|
|
If `XBYAK_NO_EXCEPTION` is defined, then gcc/clang can compile xbyak with `-fno-exceptions`.
|
|
Instead of throwing an exception, `Xbyak::GetError()` returns a non-zero value (e.g. `ERR_BAD_ADDRESSING`) if there is something wrong.
|
|
The status will not be changed automatically, so you should reset it by calling `Xbyak::ClearError()`.
|
|
`CodeGenerator::reset()` calls `ClearError()`.
|
|
|
|
## Macro
|
|
|
|
* **XBYAK32** is defined on 32bit.
|
|
* **XBYAK64** is defined on 64bit.
|
|
* **XBYAK64_WIN** is defined on 64bit Windows(VC).
|
|
* **XBYAK64_GCC** is defined on 64bit gcc, cygwin.
|
|
* define **XBYAK_USE_OP_NAMES** on gcc with `-fno-operator-names` if you want to use `and()`, ....
|
|
* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(deprecated in the future).
|
|
* define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro.
|
|
* define **XBYAK_NO_EXCEPTION** for a compiler option `-fno-exceptions`.
|
|
* define **XBYAK_USE_MEMFD** on Linux then /proc/self/maps shows the area used by xbyak.
|
|
* define **XBYAK_OLD_DISP_CHECK** if the old disp check is necessary (deprecated in the future).
|
|
|
|
## Sample
|
|
|
|
* [test0.cpp](../sample/test0.cpp) ; tiny sample (x86, x64)
|
|
* [quantize.cpp](../sample/quantize.cpp) ; JIT optimized quantization by fast division (x86 only)
|
|
* [calc.cpp](../sample/calc.cpp) ; assemble and estimate a given polynomial (x86, x64)
|
|
* [bf.cpp](../sample/bf.cpp) ; JIT brainfuck (x86, x64)
|