emit_x64_{vector_}floating_point: Add AVX512 implementation for ForceToDefaultNaN
`vfpclassp* k, xmm, i8` has better latency (4->3) than `vcmpunordp* xmm, xmm, xmm` (`vcmpp* xmm, xmm, xmm, i8`), uses better execution ports (01->5) that are out of the way of the ALU ports, and removes the pipeline dependency on `xmm0` in favor of AVX512 `k`-mask registers. `vblendmp* xmm, k, xmm, mem` has about the same throughput and latency as `blendvp* xmm, mem`, but has the benefit of embedded broadcasts to reduce memory bandwidth (a 32/64-bit read rather than a 128-bit read) and lends itself to a future size optimization feature of `constant_pool`.
This commit is contained in:
parent
6367a26e62
commit
4d78d167d6
2 changed files with 12 additions and 3 deletions
|
@ -136,7 +136,11 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
|
|||
|
||||
template<size_t fsize>
|
||||
void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
|
||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||
const Xbyak::Opmask nan_mask = k1;
|
||||
FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
|
||||
FCODE(vblendmp)(result | nan_mask, result, code.MConst(ptr_b, fsize == 32 ? f32_nan : f64_nan));
|
||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||
FCODE(vcmpunords)(xmm0, result, result);
|
||||
FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||
} else {
|
||||
|
|
|
@ -190,11 +190,16 @@ Xbyak::Address GetVectorOf(BlockOfCode& code) {
|
|||
template<size_t fsize>
|
||||
void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
|
||||
if (fpcr.DN()) {
|
||||
const Xbyak::Xmm nan_mask = xmm0;
|
||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||
const Xbyak::Opmask nan_mask = k1;
|
||||
FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
|
||||
FCODE(vblendmp)(result | nan_mask, result, GetNaNVector<fsize>(code));
|
||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||
const Xbyak::Xmm nan_mask = xmm0;
|
||||
FCODE(vcmpunordp)(nan_mask, result, result);
|
||||
FCODE(blendvp)(result, GetNaNVector<fsize>(code));
|
||||
} else {
|
||||
const Xbyak::Xmm nan_mask = xmm0;
|
||||
code.movaps(nan_mask, result);
|
||||
FCODE(cmpordp)(nan_mask, nan_mask);
|
||||
code.andps(result, nan_mask);
|
||||
|
|
Loading…
Reference in a new issue