emit_x64_{vector_}floating_point: Centralize implementation of FP{Vector}{Abs,Neg}

Removes dependency on the constants at the top of some files
such as `f16_negative_zero` and `f32_non_sign_mask` in favor
of the `FPInfo` trait-type.

Also removes bypass delays by selecting between instructions
such as `pand`, `andps`, or `andpd` depending on the type
and keeps them in their respective uop domain.

See https://www.agner.org/optimize/instruction_tables.pdf for
more info on bypass delays.
This commit is contained in:
Wunkolo 2021-06-09 09:44:55 -07:00 committed by merry
parent 759459e181
commit 776208742b
2 changed files with 55 additions and 70 deletions

View file

@ -39,9 +39,6 @@ namespace {
const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
constexpr u64 f16_negative_zero = 0x8000;
constexpr u64 f16_non_sign_mask = 0x7fff;
constexpr u64 f32_negative_zero = 0x80000000u;
constexpr u64 f32_nan = 0x7fc00000u;
constexpr u64 f32_non_sign_mask = 0x7fffffffu;
@ -328,58 +325,56 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
} // anonymous namespace
void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize>
void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>;
constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, non_sign_mask);
code.pand(result, code.MConst(xword, f16_non_sign_mask));
code.andps(result, mask);
ctx.reg_alloc.DefineValue(inst, result);
}
void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
FPAbs<16>(code, ctx, inst);
}
void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pand(result, code.MConst(xword, f32_non_sign_mask));
ctx.reg_alloc.DefineValue(inst, result);
FPAbs<32>(code, ctx, inst);
}
void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) {
FPAbs<64>(code, ctx, inst);
}
template<size_t fsize>
void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>;
constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, u64(sign_mask));
code.pand(result, code.MConst(xword, f64_non_sign_mask));
code.xorps(result, mask);
ctx.reg_alloc.DefineValue(inst, result);
}
void EmitX64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pxor(result, code.MConst(xword, f16_negative_zero));
ctx.reg_alloc.DefineValue(inst, result);
FPNeg<16>(code, ctx, inst);
}
void EmitX64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pxor(result, code.MConst(xword, f32_negative_zero));
ctx.reg_alloc.DefineValue(inst, result);
FPNeg<32>(code, ctx, inst);
}
void EmitX64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pxor(result, code.MConst(xword, f64_negative_zero));
ctx.reg_alloc.DefineValue(inst, result);
FPNeg<64>(code, ctx, inst);
}
void EmitX64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) {

View file

@ -557,37 +557,32 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
} // anonymous namespace
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize>
void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>;
constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
constexpr u64 non_sign_mask64 = Common::Replicate<u64>(non_sign_mask, fsize);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x7FFF7FFF7FFF7FFF, 0x7FFF7FFF7FFF7FFF);
code.pand(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
}
void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF);
const Xbyak::Address mask = code.MConst(xword, non_sign_mask64, non_sign_mask64);
code.andps(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
}
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
FPVectorAbs<16>(code, ctx, inst);
}
void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
FPVectorAbs<32>(code, ctx, inst);
}
void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF);
code.andpd(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
FPVectorAbs<64>(code, ctx, inst);
}
void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
@ -1229,37 +1224,32 @@ void EmitX64::EmitFPVectorMulX64(EmitContext& ctx, IR::Inst* inst) {
EmitFPVectorMulX<64>(code, ctx, inst);
}
void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize>
void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>;
constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
constexpr u64 sign_mask64 = Common::Replicate<u64>(sign_mask, fsize);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
const Xbyak::Address mask = code.MConst(xword, sign_mask64, sign_mask64);
code.pxor(a, mask);
code.xorps(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
}
void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
FPVectorNeg<16>(code, ctx, inst);
}
void EmitX64::EmitFPVectorNeg32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
code.pxor(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
FPVectorNeg<32>(code, ctx, inst);
}
void EmitX64::EmitFPVectorNeg64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Address mask = code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
code.pxor(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
FPVectorNeg<64>(code, ctx, inst);
}
void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {