From fc885ac80f0b87625eda63f8418436fc2ef5df1b Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Tue, 12 Dec 2017 16:11:22 +0000
Subject: [PATCH] EmitPackedHalvingAddU8: Add SSE2 implementation

---
 src/backend_x64/emit_x64.cpp | 51 +++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index d68de55c..05ab428b 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1737,25 +1737,44 @@ void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
 void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
-    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
-    Xbyak::Reg32 and_a_b = reg_a;
-    Xbyak::Reg32 result = reg_a;
+    if (args[0].IsInXmm() || args[1].IsInXmm()) {
+        Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm xmm_b = reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm ones = reg_alloc.ScratchXmm();
 
-    // This relies on the equality x+y == ((x&y) << 1) + (x^y).
-    // Note that x^y always contains the LSB of the result.
-    // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
-    // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
+        // Since,
+        //   pavg(a, b) == (a + b + 1) >> 1
+        // Therefore,
+        //   ~pavg(~a, ~b) == (a + b) >> 1
 
-    code->mov(xor_a_b, reg_a);
-    code->and_(and_a_b, reg_b);
-    code->xor_(xor_a_b, reg_b);
-    code->shr(xor_a_b, 1);
-    code->and_(xor_a_b, 0x7F7F7F7F);
-    code->add(result, xor_a_b);
+        code->pcmpeqb(ones, ones);
+        code->pxor(xmm_a, ones);
+        code->pxor(xmm_b, ones);
+        code->pavgb(xmm_a, xmm_b);
+        code->pxor(xmm_a, ones);
 
-    reg_alloc.DefineValue(inst, result);
+        reg_alloc.DefineValue(inst, xmm_a);
+    } else {
+        Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
+        Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
+        Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
+        Xbyak::Reg32 and_a_b = reg_a;
+        Xbyak::Reg32 result = reg_a;
+
+        // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+        // Note that x^y always contains the LSB of the result.
+        // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+        // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
+
+        code->mov(xor_a_b, reg_a);
+        code->and_(and_a_b, reg_b);
+        code->xor_(xor_a_b, reg_b);
+        code->shr(xor_a_b, 1);
+        code->and_(xor_a_b, 0x7F7F7F7F);
+        code->add(result, xor_a_b);
+
+        reg_alloc.DefineValue(inst, result);
+    }
 }
 
 void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {