emit_x64{_vector}_floating_point: Unsafe AVX512 implementation of Emit{RSqrt,Recip}Estimate

This implementation lives within the unsafe optimization paths and utilizes the 14-bit-precision `vrsqrt14*` and `vrcp14*` instructions provided by AVX512F+VL. These are _more_ accurate than both the fallback path and the current `rsqrt`/`rcp`-based unsafe code-path, but still fall in line with what is expected of the `Unsafe_ReducedErrorFP` optimization flag. When AVX512 is available, these functions have 14 bits of precision; when it is not, they have 11 bits of precision.
2021-06-22 23:55:27 -07:00 · 2021-06-22 23:55:27 -07:00 · 1fc96fd0c2
commit 1fc96fd0c2
parent ea02a7d05d
2 changed files with 36 additions and 22 deletions
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@ -766,12 +766,16 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rcpss(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrcp14s)(result, operand, operand);
            } else {
-                code.cvtsd2ss(result, operand);
-                code.rcpss(result, result);
-                code.cvtss2sd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rcpss(result, operand);
+                } else {
+                    code.cvtsd2ss(result, operand);
+                    code.rcpss(result, result);
+                    code.cvtss2sd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
@ -984,20 +988,22 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rsqrtss(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrsqrt14s)(result, operand, operand);
            } else {
-                code.cvtsd2ss(result, operand);
-                code.rsqrtss(result, result);
-                code.cvtss2sd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rsqrtss(result, operand);
+                } else {
+                    code.cvtsd2ss(result, operand);
+                    code.rsqrtss(result, result);
+                    code.cvtss2sd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
            return;
        }

-        // TODO: VRSQRT14SS implementation (AVX512F)
-
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);

        const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@ -1288,12 +1288,16 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rcpps(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrcp14p)(result, operand);
            } else {
-                code.cvtpd2ps(result, operand);
-                code.rcpps(result, result);
-                code.cvtps2pd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rcpps(result, operand);
+                } else {
+                    code.cvtpd2ps(result, operand);
+                    code.rcpps(result, result);
+                    code.cvtps2pd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
@ -1502,12 +1506,16 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rsqrtps(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrsqrt14p)(result, operand);
            } else {
-                code.cvtpd2ps(result, operand);
-                code.rsqrtps(result, result);
-                code.cvtps2pd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rsqrtps(result, operand);
+                } else {
+                    code.cvtpd2ps(result, operand);
+                    code.rsqrtps(result, result);
+                    code.cvtps2pd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);