emit_x64_vector: Simplify EmitVectorLogicalShiftRight8()

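Instead of looping once per shifted bit, we can shift each 16-bit halfword with PSRLW and then AND the result with a replicated per-byte mask, which clears the bits that were shifted in from the neighbouring byte.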
2018-08-31 09:07:05 -04:00 · 2018-08-31 09:07:05 -04:00 · 135107279d
commit 135107279d
parent 2952b46b16
1 changed files with 7 additions and 10 deletions
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -1102,18 +1102,15 @@ void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

-    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-    Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
-    Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
    const u8 shift_amount = args[1].GetImmediateU8();

-    // TODO: Optimize
-    code.pcmpeqb(mask, mask); // mask = 0xFF
-    code.paddb(mask, mask); // mask = 0xFE
-    code.pxor(zeros, zeros);
-    for (size_t i = 0; i < shift_amount; ++i) {
-        code.pand(result, mask);
-        code.pavgb(result, zeros);
+    if (shift_amount > 0) {
+        const u64 replicand = 0xFEULL >> shift_amount;
+        const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
+
+        code.psrlw(result, shift_amount);
+        code.pand(result, code.MConst(xword, mask, mask));
    }

    ctx.reg_alloc.DefineValue(inst, result);