emit_x64_{vector_}floating_point: Centralize implementation of FP{Vector}{Abs,Neg}
Removes the dependency on the constants defined at the top of some files, such as `f16_negative_zero` and `f32_non_sign_mask`, in favor of the `FPInfo` trait type. Also avoids bypass delays by selecting between instructions such as `pand`, `andps`, or `andpd` depending on the type, which keeps each operation in its respective uop domain. See https://www.agner.org/optimize/instruction_tables.pdf for more information on bypass delays.
This commit is contained in:
parent
759459e181
commit
776208742b
2 changed files with 55 additions and 70 deletions
|
@ -39,9 +39,6 @@ namespace {
|
|||
|
||||
const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
|
||||
|
||||
constexpr u64 f16_negative_zero = 0x8000;
|
||||
constexpr u64 f16_non_sign_mask = 0x7fff;
|
||||
|
||||
constexpr u64 f32_negative_zero = 0x80000000u;
|
||||
constexpr u64 f32_nan = 0x7fc00000u;
|
||||
constexpr u64 f32_non_sign_mask = 0x7fffffffu;
|
||||
|
@ -328,58 +325,56 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
|
|||
|
||||
} // anonymous namespace
|
||||
|
||||
void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
|
||||
template<size_t fsize>
|
||||
void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, non_sign_mask);
|
||||
|
||||
code.pand(result, code.MConst(xword, f16_non_sign_mask));
|
||||
code.andps(result, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
|
||||
FPAbs<16>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
code.pand(result, code.MConst(xword, f32_non_sign_mask));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
FPAbs<32>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) {
|
||||
FPAbs<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
template<size_t fsize>
|
||||
void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, u64(sign_mask));
|
||||
|
||||
code.pand(result, code.MConst(xword, f64_non_sign_mask));
|
||||
code.xorps(result, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
code.pxor(result, code.MConst(xword, f16_negative_zero));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
FPNeg<16>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
code.pxor(result, code.MConst(xword, f32_negative_zero));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
FPNeg<32>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
code.pxor(result, code.MConst(xword, f64_negative_zero));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
FPNeg<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
|
|
@ -557,37 +557,32 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
|
|||
|
||||
} // anonymous namespace
|
||||
|
||||
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
|
||||
template<size_t fsize>
|
||||
void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
|
||||
constexpr u64 non_sign_mask64 = Common::Replicate<u64>(non_sign_mask, fsize);
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x7FFF7FFF7FFF7FFF, 0x7FFF7FFF7FFF7FFF);
|
||||
|
||||
code.pand(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF);
|
||||
const Xbyak::Address mask = code.MConst(xword, non_sign_mask64, non_sign_mask64);
|
||||
|
||||
code.andps(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
|
||||
FPVectorAbs<16>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
|
||||
FPVectorAbs<32>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF);
|
||||
|
||||
code.andpd(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
FPVectorAbs<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
@ -1229,37 +1224,32 @@ void EmitX64::EmitFPVectorMulX64(EmitContext& ctx, IR::Inst* inst) {
|
|||
EmitFPVectorMulX<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
|
||||
template<size_t fsize>
|
||||
void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
|
||||
constexpr u64 sign_mask64 = Common::Replicate<u64>(sign_mask, fsize);
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
|
||||
const Xbyak::Address mask = code.MConst(xword, sign_mask64, sign_mask64);
|
||||
|
||||
code.pxor(a, mask);
|
||||
code.xorps(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
|
||||
FPVectorNeg<16>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorNeg32(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
|
||||
|
||||
code.pxor(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
FPVectorNeg<32>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorNeg64(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Address mask = code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
|
||||
|
||||
code.pxor(a, mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
FPVectorNeg<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
|
Loading…
Reference in a new issue