emit_x64_{vector_}floating_point: Add AVX512 implementation for ForceToDefaultNaN

`vfpclassp* k, xmm, i8` has better latency (4->3) than `vcmpunordp* xmm, xmm, xmm` (`vcmpp* xmm, xmm, xmm, i8`) and is allocated to better execution ports (01->5) that are out of the way of the ALU ports. It also removes the pipeline dependency on `xmm0` in favor of AVX512 `k`-mask registers. `vblendmp* xmm, k, xmm, mem` has about the same throughput and latency as `blendvp* xmm, mem`, but has the benefit of embedded broadcasts, which reduce memory bandwidth (a 32/64-bit read rather than a 128-bit read), and lends itself to a future size-optimization feature of `constant_pool`.
2022-06-18 01:12:36 -07:00 · 2022-06-18 01:12:36 -07:00 · 4d78d167d6
commit 4d78d167d6
parent 6367a26e62
2 changed files with 12 additions and 3 deletions
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@ -136,7 +136,11 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)

 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
-    if (code.HasHostFeature(HostFeature::AVX)) {
+    if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+        const Xbyak::Opmask nan_mask = k1;
+        FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
+        FCODE(vblendmp)(result | nan_mask, result, code.MConst(ptr_b, fsize == 32 ? f32_nan : f64_nan));
+    } else if (code.HasHostFeature(HostFeature::AVX)) {
        FCODE(vcmpunords)(xmm0, result, result);
        FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
    } else {
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@ -190,11 +190,16 @@ Xbyak::Address GetVectorOf(BlockOfCode& code) {
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
    if (fpcr.DN()) {
-        const Xbyak::Xmm nan_mask = xmm0;
-        if (code.HasHostFeature(HostFeature::AVX)) {
+        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+            const Xbyak::Opmask nan_mask = k1;
+            FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
+            FCODE(vblendmp)(result | nan_mask, result, GetNaNVector<fsize>(code));
+        } else if (code.HasHostFeature(HostFeature::AVX)) {
+            const Xbyak::Xmm nan_mask = xmm0;
            FCODE(vcmpunordp)(nan_mask, result, result);
            FCODE(blendvp)(result, GetNaNVector<fsize>(code));
        } else {
+            const Xbyak::Xmm nan_mask = xmm0;
            code.movaps(nan_mask, result);
            FCODE(cmpordp)(nan_mask, nan_mask);
            code.andps(result, nan_mask);