emit_x64: Eliminate conversion of GE flags
* We do this so that we can simplify PackedSelect.
* We also try to minimise xmm-gpr/gpr-xmm transfers in PackedSelect.
This commit is contained in:
parent
f734d7000e
commit
305e4baa29
1 changed file with 80 additions and 73 deletions
|
@ -330,9 +330,23 @@ void EmitX64::EmitOrQFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
|
|
||||||
void EmitX64::EmitGetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitGetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
|
Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
|
||||||
|
Xbyak::Reg32 tmp;
|
||||||
|
|
||||||
|
if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
|
||||||
|
tmp = reg_alloc.ScratchGpr().cvt32();
|
||||||
|
code->mov(tmp, 0x01010101);
|
||||||
|
}
|
||||||
code->mov(result, MJitStateCpsr());
|
code->mov(result, MJitStateCpsr());
|
||||||
code->shr(result, 16);
|
code->shr(result, 16);
|
||||||
code->and_(result, 0xF);
|
if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
|
||||||
|
code->pdep(result, result, tmp);
|
||||||
|
} else {
|
||||||
|
code->and_(result, 0xF);
|
||||||
|
code->imul(result, result, 0x00204081);
|
||||||
|
code->and_(result, 0x01010101);
|
||||||
|
}
|
||||||
|
code->imul(result, result, 0xFF);
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, result);
|
reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -340,18 +354,23 @@ void EmitX64::EmitSetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
constexpr size_t flag_bit = 16;
|
constexpr size_t flag_bit = 16;
|
||||||
constexpr u32 flag_mask = 0xFu << flag_bit;
|
constexpr u32 flag_mask = 0xFu << flag_bit;
|
||||||
auto args = reg_alloc.GetArgumentInfo(inst);
|
auto args = reg_alloc.GetArgumentInfo(inst);
|
||||||
if (args[0].IsImmediate()) {
|
ASSERT(!args[0].IsImmediate());
|
||||||
u32 imm = (args[0].GetImmediateU32() << flag_bit) & flag_mask;
|
|
||||||
code->and_(MJitStateCpsr(), ~flag_mask);
|
|
||||||
code->or_(MJitStateCpsr(), imm);
|
|
||||||
} else {
|
|
||||||
Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
||||||
|
|
||||||
code->shl(to_store, flag_bit);
|
Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||||
code->and_(to_store, flag_mask);
|
|
||||||
code->and_(MJitStateCpsr(), ~flag_mask);
|
if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
|
||||||
code->or_(MJitStateCpsr(), to_store);
|
Xbyak::Reg32 tmp = reg_alloc.ScratchGpr().cvt32();
|
||||||
|
code->mov(tmp, 0x80808080);
|
||||||
|
code->pext(to_store, to_store, tmp);
|
||||||
|
} else {
|
||||||
|
code->and_(to_store, 0x80808080);
|
||||||
|
code->imul(to_store, to_store, 0x00204081);
|
||||||
|
code->shr(to_store, 28);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
code->shl(to_store, flag_bit);
|
||||||
|
code->and_(MJitStateCpsr(), ~flag_mask);
|
||||||
|
code->or_(MJitStateCpsr(), to_store);
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitBXWritePC(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitBXWritePC(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
|
@ -1444,40 +1463,16 @@ void EmitX64::EmitSignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::In
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
static void MaskOfMostSignificantBitFromPackedBytes(BlockOfCode* code, RegAlloc&, Xbyak::Reg32 value, boost::optional<Xbyak::Reg32> = boost::none) {
|
||||||
* Extracts the most significant bits from each of the packed bytes, and packs them together.
|
code->and_(value, 0x80808080);
|
||||||
*
|
code->shr(value, 7);
|
||||||
* value before: a-------b-------c-------d-------
|
code->imul(value, value, 0xFF);
|
||||||
* value after: 0000000000000000000000000000abcd
|
|
||||||
*
|
|
||||||
* @param value The register containing the value to operate on. Result will be stored in the same register.
|
|
||||||
* @param a_tmp A register which can be used as a scratch register.
|
|
||||||
*/
|
|
||||||
static void ExtractMostSignificantBitFromPackedBytes(BlockOfCode* code, RegAlloc& reg_alloc, Xbyak::Reg32 value, boost::optional<Xbyak::Reg32> a_tmp = boost::none) {
|
|
||||||
if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
|
|
||||||
Xbyak::Reg32 tmp = a_tmp ? *a_tmp : reg_alloc.ScratchGpr().cvt32();
|
|
||||||
code->mov(tmp, 0x80808080);
|
|
||||||
code->pext(value, value, tmp);
|
|
||||||
} else {
|
|
||||||
code->and_(value, 0x80808080);
|
|
||||||
code->imul(value, value, 0x00204081);
|
|
||||||
code->shr(value, 28);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
static void MaskOfMostSignificantBitFromPackedWords(BlockOfCode* code, Xbyak::Reg32 value) {
|
||||||
* Extracts the most significant bits from each of the packed words, duplicates them, and packs them together.
|
|
||||||
*
|
|
||||||
* value before: a---------------b---------------
|
|
||||||
* value after: 0000000000000000000000000000aabb
|
|
||||||
*
|
|
||||||
* @param value The register containing the value to operate on. Result will be stored in the same register.
|
|
||||||
*/
|
|
||||||
static void ExtractAndDuplicateMostSignificantBitFromPackedWords(BlockOfCode* code, Xbyak::Reg32 value) {
|
|
||||||
code->and_(value, 0x80008000);
|
code->and_(value, 0x80008000);
|
||||||
code->shr(value, 1);
|
code->shr(value, 15);
|
||||||
code->imul(value, value, 0xC003);
|
code->imul(value, value, 0xFFFF);
|
||||||
code->shr(value, 28);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitPackedAddU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
|
void EmitX64::EmitPackedAddU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
|
||||||
|
@ -1501,7 +1496,6 @@ void EmitX64::EmitPackedAddU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
|
||||||
code->movd(reg_ge, tmp);
|
code->movd(reg_ge, tmp);
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
|
|
||||||
ExtractMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1532,7 +1526,7 @@ void EmitX64::EmitPackedAddS8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
|
||||||
|
|
||||||
if (ge_inst) {
|
if (ge_inst) {
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
ExtractMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
MaskOfMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1571,7 +1565,6 @@ void EmitX64::EmitPackedAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
|
||||||
code->movd(reg_ge, tmp_b);
|
code->movd(reg_ge, tmp_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
ExtractMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1601,7 +1594,7 @@ void EmitX64::EmitPackedAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
|
||||||
|
|
||||||
if (ge_inst) {
|
if (ge_inst) {
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge);
|
MaskOfMostSignificantBitFromPackedWords(code, reg_ge);
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1626,15 +1619,12 @@ void EmitX64::EmitPackedSubU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
|
||||||
code->pmaxub(xmm_ge, xmm_b);
|
code->pmaxub(xmm_ge, xmm_b);
|
||||||
code->pcmpeqb(xmm_ge, xmm_a);
|
code->pcmpeqb(xmm_ge, xmm_a);
|
||||||
code->movd(reg_ge, xmm_ge);
|
code->movd(reg_ge, xmm_ge);
|
||||||
|
|
||||||
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
code->psubb(xmm_a, xmm_b);
|
code->psubb(xmm_a, xmm_b);
|
||||||
|
|
||||||
if (ge_inst) {
|
|
||||||
ExtractMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
|
||||||
}
|
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, xmm_a);
|
reg_alloc.DefineValue(inst, xmm_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1662,7 +1652,7 @@ void EmitX64::EmitPackedSubS8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
|
||||||
|
|
||||||
if (ge_inst) {
|
if (ge_inst) {
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
ExtractMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
MaskOfMostSignificantBitFromPackedBytes(code, reg_alloc, reg_ge);
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1697,15 +1687,12 @@ void EmitX64::EmitPackedSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
|
||||||
code->movd(reg_ge, xmm_ge);
|
code->movd(reg_ge, xmm_ge);
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
code->psubw(xmm_a, xmm_b);
|
code->psubw(xmm_a, xmm_b);
|
||||||
|
|
||||||
if (ge_inst) {
|
|
||||||
ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge);
|
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
|
||||||
}
|
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, xmm_a);
|
reg_alloc.DefineValue(inst, xmm_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1732,7 +1719,7 @@ void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
|
||||||
|
|
||||||
if (ge_inst) {
|
if (ge_inst) {
|
||||||
code->not_(reg_ge);
|
code->not_(reg_ge);
|
||||||
ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge);
|
MaskOfMostSignificantBitFromPackedWords(code, reg_ge);
|
||||||
reg_alloc.DefineValue(ge_inst, reg_ge);
|
reg_alloc.DefineValue(ge_inst, reg_ge);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2022,15 +2009,16 @@ void EmitPackedSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block,
|
||||||
|
|
||||||
if (!is_signed) {
|
if (!is_signed) {
|
||||||
code->shl(ge_sum, 15);
|
code->shl(ge_sum, 15);
|
||||||
code->sar(ge_sum, 16);
|
code->sar(ge_sum, 31);
|
||||||
} else {
|
} else {
|
||||||
code->not_(ge_sum);
|
code->not_(ge_sum);
|
||||||
|
code->sar(ge_sum, 31);
|
||||||
}
|
}
|
||||||
code->not_(ge_diff);
|
code->not_(ge_diff);
|
||||||
code->and_(ge_sum, hi_is_sum ? 0xC0000000 : 0x30000000);
|
code->sar(ge_diff, 31);
|
||||||
code->and_(ge_diff, hi_is_sum ? 0x30000000 : 0xC0000000);
|
code->and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
|
||||||
|
code->and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
|
||||||
code->or_(ge_sum, ge_diff);
|
code->or_(ge_sum, ge_diff);
|
||||||
code->shr(ge_sum, 28);
|
|
||||||
|
|
||||||
reg_alloc.DefineValue(ge_inst, ge_sum);
|
reg_alloc.DefineValue(ge_inst, ge_sum);
|
||||||
}
|
}
|
||||||
|
@ -2131,19 +2119,38 @@ void EmitX64::EmitPackedAbsDiffSumS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
||||||
void EmitX64::EmitPackedSelect(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitPackedSelect(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
auto args = reg_alloc.GetArgumentInfo(inst);
|
auto args = reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Reg32 ge = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
if (args[1].IsInXmm() && args[2].IsInXmm()) {
|
||||||
Xbyak::Reg32 to = reg_alloc.UseScratchGpr(args[1]).cvt32();
|
Xbyak::Xmm ge = reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Reg32 from = reg_alloc.UseScratchGpr(args[2]).cvt32();
|
Xbyak::Xmm to = reg_alloc.UseXmm(args[1]);
|
||||||
Xbyak::Reg32 tmp = reg_alloc.ScratchGpr().cvt32();
|
Xbyak::Xmm from = reg_alloc.UseScratchXmm(args[2]);
|
||||||
|
|
||||||
code->mov(tmp, 0x01010101);
|
code->pand(from, ge);
|
||||||
code->pdep(ge, ge, tmp);
|
code->pandn(ge, to);
|
||||||
code->imul(ge, ge, 0xFF);
|
code->por(from, ge);
|
||||||
code->and_(from, ge);
|
|
||||||
code->andn(to, ge, to);
|
|
||||||
code->or_(from, to);
|
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, from);
|
reg_alloc.DefineValue(inst, from);
|
||||||
|
} else if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) {
|
||||||
|
Xbyak::Reg32 ge = reg_alloc.UseGpr(args[0]).cvt32();
|
||||||
|
Xbyak::Reg32 to = reg_alloc.UseScratchGpr(args[1]).cvt32();
|
||||||
|
Xbyak::Reg32 from = reg_alloc.UseScratchGpr(args[2]).cvt32();
|
||||||
|
|
||||||
|
code->and_(from, ge);
|
||||||
|
code->andn(to, ge, to);
|
||||||
|
code->or_(from, to);
|
||||||
|
|
||||||
|
reg_alloc.DefineValue(inst, from);
|
||||||
|
} else {
|
||||||
|
Xbyak::Reg32 ge = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||||
|
Xbyak::Reg32 to = reg_alloc.UseScratchGpr(args[1]).cvt32();
|
||||||
|
Xbyak::Reg32 from = reg_alloc.UseScratchGpr(args[2]).cvt32();
|
||||||
|
|
||||||
|
code->and_(from, ge);
|
||||||
|
code->not_(ge);
|
||||||
|
code->and_(to, ge);
|
||||||
|
code->or_(from, to);
|
||||||
|
|
||||||
|
reg_alloc.DefineValue(inst, from);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DenormalsAreZero32(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
static void DenormalsAreZero32(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
||||||
|
|
Loading…
Reference in a new issue