From 18717d216c0646dda168e230f2bc8fa172ee009f Mon Sep 17 00:00:00 2001
From: Wunkolo
Date: Sun, 4 Feb 2024 11:02:15 -0800
Subject: [PATCH] emit_x64_vector: AVX512+GFNI implementation of
 EmitVectorLogicalVShift8

---
 src/dynarmic/backend/x64/emit_x64_vector.cpp | 41 ++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 4f47f2e0..47180de0 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -1824,6 +1824,47 @@ void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        const Xbyak::Opmask negative_mask = k1;
+        code.pxor(tmp, tmp);
+        code.vpcmpb(negative_mask, left_shift, tmp, CmpInt::LessThan);
+
+        // Reverse bits of negative shifts
+        code.vmovaps(xmm0, code.BConst<64>(xword, 0x8040201008040201));
+        code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0);
+
+        // Turn all negative shifts into left-shifts
+        code.pabsb(left_shift, left_shift);
+
+        const Xbyak::Opmask valid_index = k2;
+        code.vptestnmb(valid_index, left_shift, code.BConst<8>(xword, 0xF8));
+
+        // gf2p8mulb's "x^8 + x^4 + x^3 + x + 1"-polynomial-reduction only applies
+        // when the multiplication overflows. Masking away any bits that would have
+        // overflowed turns the polynomial-multiplication into regular modulo-multiplication
+        code.movdqa(tmp, code.Const(xword, 0x01'03'07'0f'1f'3f'7f'ff, 0));
+        code.vpshufb(tmp | valid_index | T_z, tmp, left_shift);
+        code.pand(result, tmp);
+
+        // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc
+        code.pxor(tmp, tmp);
+        code.movsd(tmp, xmm0);
+        code.pshufb(tmp, left_shift);
+
+        code.gf2p8mulb(result, tmp);
+
+        // Un-reverse bits of negative shifts
+        code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0);
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
         std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
     });
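
Note for reviewers: a minimal scalar sketch of the operation this path vectorizes, based on my reading of the VShift<u8> fallback and the 0xF8 validity mask above. The shift count is the signed low byte of the second operand: positive counts shift left, negative counts shift logically right, and a magnitude of 8 or more clears the byte. The function name LogicalVShift8 is mine, for illustration only.

#include <cassert>
#include <cstdint>

// Hypothetical scalar reference for the byte-wise variable logical shift.
static uint8_t LogicalVShift8(uint8_t value, uint8_t shift) {
    const int amount = static_cast<int8_t>(shift);  // shift count is signed
    if (amount <= -8 || amount >= 8) {
        return 0;  // out-of-range magnitudes clear the byte
    }
    return amount < 0 ? static_cast<uint8_t>(value >> -amount)
                      : static_cast<uint8_t>(value << amount);
}

int main() {
    assert(LogicalVShift8(0x81, 1) == 0x02);     // left shift drops bit 7
    assert(LogicalVShift8(0x81, 0xFF) == 0x40);  // -1 shifts right, logically
    assert(LogicalVShift8(0x81, 8) == 0);        // magnitude >= 8 clears
    return 0;
}

The -8 case lines up with the vptestnmb mask: pabsb leaves -128 as 0x80, which intersects 0xF8, so it is treated as invalid and zeroed like any other out-of-range count.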
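
The two GF(2^8) identities the patch leans on can be checked exhaustively in plain C++. The following is a sketch of mine, not emitted code: Gf2p8Mul models gf2p8mulb's fixed-polynomial multiply, and the loop verifies both that a masked multiply by 1 << k equals a left shift by k (the movdqa/vpshufb/pand step) and that a logical right shift is a left shift sandwiched between two bit reversals (the vgf2p8affineqb steps for negative shifts).

#include <cassert>
#include <cstdint>

// Models gf2p8mulb: carry-less multiply in GF(2^8), reduced by the fixed
// polynomial x^8 + x^4 + x^3 + x + 1 (0x11b).
static uint8_t Gf2p8Mul(uint8_t a, uint8_t b) {
    uint8_t product = 0;
    for (int i = 0; i < 8; ++i) {
        if (b & 1) {
            product ^= a;
        }
        const bool overflow = (a & 0x80) != 0;
        a <<= 1;
        if (overflow) {
            a ^= 0x1b;  // low byte of the reduction polynomial
        }
        b >>= 1;
    }
    return product;
}

// Models vgf2p8affineqb with the 0x8040201008040201 matrix: bit reversal.
static uint8_t ReverseBits(uint8_t v) {
    uint8_t r = 0;
    for (int i = 0; i < 8; ++i) {
        r = static_cast<uint8_t>((r << 1) | ((v >> i) & 1));
    }
    return r;
}

int main() {
    for (int n = 0; n < 256; ++n) {
        const uint8_t value = static_cast<uint8_t>(n);
        for (int k = 0; k < 8; ++k) {
            // Masking off the bits that would overflow keeps the polynomial
            // reduction from ever firing, so multiplying by 1 << k is a
            // plain left shift by k. This is the 0x01'03'07'0f'1f'3f'7f'ff
            // mask table plus pand in the patch.
            const uint8_t mask = static_cast<uint8_t>(0xff >> k);
            assert(Gf2p8Mul(value & mask, static_cast<uint8_t>(1u << k))
                   == static_cast<uint8_t>(value << k));
            // A logical right shift is a left shift between two bit
            // reversals; this is how negative shift amounts reuse the
            // left-shift path.
            assert(ReverseBits(static_cast<uint8_t>(ReverseBits(value) << k))
                   == value >> k);
        }
    }
    return 0;
}

A nice touch in the patch is the movsd/pshufb pair: read little-endian, the low qword of the 0x8040201008040201 bit-reversal constant is exactly the byte table {0x01, 0x02, 0x04, ..., 0x80}, so zero-extending it and indexing with the shift amount yields the 1 << k multiplier for free.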
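
Finally, a sketch of the affine step itself, under my reading of the instruction's bit convention (an assumption worth double-checking against the SDM): output bit i is the parity of the source byte ANDed with matrix byte 7 - i, XORed with bit i of the immediate. With the patch's 0x8040201008040201 matrix this reverses the bits of each byte; with 0x0102040810204080 it is the identity.

#include <bit>
#include <cassert>
#include <cstdint>

// Sketch of gf2p8affineqb's per-byte affine transform, under the assumed
// convention: out bit i = parity(matrix byte (7 - i) & x) ^ imm bit i.
static uint8_t Gf2p8Affine(uint8_t x, uint64_t matrix, uint8_t imm) {
    uint8_t out = 0;
    for (int i = 0; i < 8; ++i) {
        const auto row = static_cast<uint8_t>(matrix >> (8 * (7 - i)));
        const unsigned bit =
            (std::popcount(static_cast<unsigned>(row & x)) & 1u) ^ ((imm >> i) & 1u);
        out |= static_cast<uint8_t>(bit << i);
    }
    return out;
}

int main() {
    for (int n = 0; n < 256; ++n) {
        const uint8_t x = static_cast<uint8_t>(n);
        uint8_t reversed = 0;
        for (int i = 0; i < 8; ++i) {
            reversed = static_cast<uint8_t>((reversed << 1) | ((x >> i) & 1));
        }
        // The patch's constant reverses bits; flipping the rows end-for-end
        // gives the identity matrix.
        assert(Gf2p8Affine(x, 0x8040201008040201, 0) == reversed);
        assert(Gf2p8Affine(x, 0x0102040810204080, 0) == x);
    }
    return 0;
}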