From 87f6e412d0fa6e4500d96050be900720d42bffa4 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Thu, 18 Jun 2020 18:35:21 +0100
Subject: [PATCH] emit_x64_vector: SSE4.1 implementation of EmitVectorPolynomialMultiply{Long}8

---
 src/backend/x64/emit_x64_vector.cpp | 74 +++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 97597b06..9b40e65b 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -2495,12 +2495,86 @@ static D PolynomialMultiply(T lhs, T rhs) {
 }
 
 void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
+    if (code.HasSSE41()) {  // Inline path: emits a per-byte-lane carry-less (XOR-accumulate) multiply loop; pblendvb below is an SSE4.1 instruction. Without SSE4.1 the fallback after this block is used.
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);  // requested as scratch: xmm_a is doubled in place every iteration (paddb below)
+        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);  // only ever read (source of vpand / movdqa)
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();  // running product, zeroed before the loop
+        const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm();  // per-iteration candidate: result ^ xmm_a
+        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();  // one bit per byte lane, selects the bit of xmm_b under test
+        const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32();  // loop trip counter
+
+        Xbyak::Label loop;
+
+        code.pxor(result, result);  // result = 0
+        code.movdqa(mask, code.MConst(xword, 0x0101010101010101, 0x0101010101010101));  // start with bit 0 of every byte
+        code.mov(counter, 8);  // one iteration per bit position of a byte
+
+        code.L(loop);
+        if (code.HasAVX()) {  // three-operand AVX forms avoid the register copies needed in the else branch
+            code.vpand(xmm0, xmm_b, mask);  // xmm0 = xmm_b & mask. NOTE(review): xmm0 is used directly, not obtained from reg_alloc - presumably reserved as a scratch register; confirm.
+            code.vpxor(alternate, result, xmm_a);  // alternate = result ^ xmm_a
+        } else {
+            code.movdqa(xmm0, xmm_b);  // SSE forms are destructive, so copy the sources first
+            code.movdqa(alternate, result);
+            code.pand(xmm0, mask);  // xmm0 = xmm_b & mask
+            code.pxor(alternate, xmm_a);  // alternate = result ^ xmm_a
+        }
+        code.pcmpeqb(xmm0, mask);  // lanes where the tested bit of xmm_b is set become 0xFF, all others 0x00
+        code.paddb(mask, mask);  // advance mask to the next higher bit for the following iteration
+        code.paddb(xmm_a, xmm_a);  // xmm_a <<= 1 in every byte lane; bits carried out of bit 7 are discarded
+        code.pblendvb(result, alternate);  // implicit xmm0 selector: lanes with xmm0 == 0xFF take alternate, the rest keep result
+        code.dec(counter);
+        code.jnz(loop);  // dec sets ZF when counter reaches zero, ending the loop after 8 passes
+
+        ctx.reg_alloc.DefineValue(inst, result);  // hand result to the register allocator as the value of inst
+        return;  // skip the generic fallback below
+    }
+
     EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
         std::transform(a.begin(), a.end(), b.begin(), result.begin(), PolynomialMultiply);
     });
 }
 
 void
EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasSSE41()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32(); + + Xbyak::Label loop; + + code.pmovzxbw(xmm_a, xmm_a); + code.pmovzxbw(xmm_b, xmm_b); + code.pxor(result, result); + code.movdqa(mask, code.MConst(xword, 0x0001000100010001, 0x0001000100010001)); + code.mov(counter, 8); + + code.L(loop); + if (code.HasAVX()) { + code.vpand(xmm0, xmm_b, mask); + code.vpxor(alternate, result, xmm_a); + } else { + code.movdqa(xmm0, xmm_b); + code.movdqa(alternate, result); + code.pand(xmm0, mask); + code.pxor(alternate, xmm_a); + } + code.pcmpeqw(xmm0, mask); + code.paddw(mask, mask); + code.paddw(xmm_a, xmm_a); + code.pblendvb(result, alternate); + code.dec(counter); + code.jnz(loop); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { for (size_t i = 0; i < result.size(); i++) { result[i] = PolynomialMultiply(a[i], b[i]);