From 249297003994e5f65a9796d996445583a88f52c5 Mon Sep 17 00:00:00 2001 From: merry Date: Thu, 29 Dec 2022 19:31:54 +0000 Subject: [PATCH] emit_x64_vector_floating_point: AVX implementation for EmitFPVectorMinMaxNumeric --- .../x64/emit_x64_vector_floating_point.cpp | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index d97e906d..c9a98aab 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -1081,6 +1081,80 @@ static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::I // !NaN QNaN op1 // QNaN QNaN op1 + if (code.HasHostFeature(HostFeature::AVX)) { + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + using FPT = mcl::unsigned_integer_of_size; + + // result = xmm_a == SNaN || xmm_b == QNaN + { + // evaluate xmm_b == QNaN + FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b); + ICODE(vpsll)(tmp2, xmm_b, static_cast(fsize - FP::FPInfo::explicit_mantissa_width)); + { + code.vpsrad(tmp2, tmp2, 31); + if constexpr (fsize == 64) { + code.vpshufd(tmp2, tmp2, 0b11110101); + } + } + code.vandps(result, tmp1, tmp2); + + // evaluate xmm_a == SNaN + FCODE(vcmpunordp)(tmp1, xmm_a, xmm_a); + ICODE(vpsll)(tmp2, xmm_a, static_cast(fsize - FP::FPInfo::explicit_mantissa_width)); + { + code.vpsrad(tmp2, tmp2, 31); + if constexpr (fsize == 64) { + code.vpshufd(tmp2, tmp2, 0b11110101); + } + } + code.vandnps(tmp2, tmp2, tmp1); + + code.vorps(result, tmp2); + } + + // Denormalization quiets SNaNs, therefore should happen after SNaN detection! + DenormalsAreZero(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1); + + // intermediate result = max/min(xmm_a, xmm_b) + { + const Xbyak::Xmm eq_mask = tmp1; + const Xbyak::Xmm eq = tmp2; + + FCODE(vcmpeqp)(eq_mask, xmm_a, xmm_b); + + if constexpr (is_max) { + code.vandps(eq, xmm_a, xmm_b); + FCODE(vmaxp)(intermediate_result, xmm_a, xmm_b); + } else { + code.vorps(eq, xmm_a, xmm_b); + FCODE(vminp)(intermediate_result, xmm_a, xmm_b); + } + + code.blendvps(intermediate_result, eq); // eq_mask is in xmm0 + } + + { + code.vblendvps(result, intermediate_result, xmm_a, result); + } + + if (ctx.FPCR(fpcr_controlled).DN()) { + const Xbyak::Xmm ord_mask = tmp1; + + FCODE(vcmpunordp)(ord_mask, result, result); + code.blendvps(result, GetNaNVector(code)); // ord_mask is in xmm0 + } else { + const Xbyak::Xmm nan_mask = tmp1; + + FCODE(vcmpunordp)(nan_mask, result, result); + code.vandps(nan_mask, nan_mask, GetVectorOf::mantissa_msb>(code)); + code.vorps(result, result, nan_mask); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { using FPT = mcl::unsigned_integer_of_size;