// dynarmic/src/backend_x64/emit_x64_packed.cpp

/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */

#include "backend_x64/block_of_code.h"
#include "backend_x64/emit_x64.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "frontend/ir/basic_block.h"
#include "frontend/ir/microinstruction.h"
#include "frontend/ir/opcodes.h"

namespace Dynarmic::BackendX64 {

using namespace Xbyak::util;

// Emits a lane-wise 8-bit addition (paddb) of operand 1 into operand 0.
// If a GetGEFromOp pseudo-operation is attached to `inst`, a per-byte mask is
// also produced that is all-ones in every lane where the unsigned sum wrapped.
void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm sum = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(operands[1]);

    code.paddb(sum, addend);

    if (!ge_user) {
        ctx.reg_alloc.DefineValue(inst, sum);
        return;
    }

    const Xbyak::Xmm carry_mask = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

    code.pcmpeqb(all_ones, all_ones);

    // No wrap-around  <=>  addend <= sum  <=>  min(sum, addend) == addend.
    // Inverting that comparison leaves all-ones in the lanes that carried.
    code.movdqa(carry_mask, sum);
    code.pminub(carry_mask, addend);
    code.pcmpeqb(carry_mask, addend);
    code.pxor(carry_mask, all_ones);

    ctx.reg_alloc.DefineValue(ge_user, carry_mask);
    ctx.EraseInstruction(ge_user);

    ctx.reg_alloc.DefineValue(inst, sum);
}

// Emits a lane-wise 8-bit addition (paddb) of operand 1 into operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is produced
// first (while operand 0 is still unmodified) that is all-ones in every lane
// whose signed sum is non-negative.
void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(operands[1]);

    if (ge_user) {
        const Xbyak::Xmm sat = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm nonneg_mask = ctx.reg_alloc.ScratchXmm();

        // The saturating add cannot wrap, so its sign matches the exact sum.
        code.pxor(nonneg_mask, nonneg_mask);
        code.movdqa(sat, lhs);
        code.paddsb(sat, rhs);
        // 0 > sat marks the negative lanes ...
        code.pcmpgtb(nonneg_mask, sat);
        // ... then `sat` is recycled as an all-ones register to invert the mask.
        code.pcmpeqb(sat, sat);
        code.pxor(nonneg_mask, sat);

        ctx.reg_alloc.DefineValue(ge_user, nonneg_mask);
        ctx.EraseInstruction(ge_user);
    }

    code.paddb(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}

// Emits a lane-wise 16-bit addition (paddw) of operand 1 into operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced that is all-ones in every lane where the unsigned sum wrapped.
// Two strategies exist for the mask: pminuw when SSE4.1 is available, otherwise
// a biased signed comparison.
void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm sum = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(operands[1]);

    code.paddw(sum, addend);

    if (!ge_user) {
        ctx.reg_alloc.DefineValue(inst, sum);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        const Xbyak::Xmm carry_mask = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

        code.pcmpeqb(all_ones, all_ones);

        // No wrap-around  <=>  min(sum, addend) == addend; invert to get carries.
        code.movdqa(carry_mask, sum);
        code.pminuw(carry_mask, addend);
        code.pcmpeqw(carry_mask, addend);
        code.pxor(carry_mask, all_ones);

        ctx.reg_alloc.DefineValue(ge_user, carry_mask);
        ctx.EraseInstruction(ge_user);
    } else {
        const Xbyak::Xmm biased_sum = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm biased_addend = ctx.reg_alloc.ScratchXmm();

        // Carry occurred  <=>  addend > sum (unsigned).
        // Only a signed compare is available here, so both sides are offset by
        // 0x8000 per halfword, which turns unsigned ordering into signed ordering.
        code.movdqa(biased_sum, sum);
        code.movdqa(biased_addend, addend);
        code.paddw(biased_sum, code.MConst(0x80008000));
        code.paddw(biased_addend, code.MConst(0x80008000));
        code.pcmpgtw(biased_addend, biased_sum);

        ctx.reg_alloc.DefineValue(ge_user, biased_addend);
        ctx.EraseInstruction(ge_user);
    }

    ctx.reg_alloc.DefineValue(inst, sum);
}

// Emits a lane-wise 16-bit addition (paddw) of operand 1 into operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is
// produced first (while operand 0 is still unmodified) that is all-ones in
// every lane whose signed sum is non-negative.
void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(operands[1]);

    if (ge_user) {
        const Xbyak::Xmm sat = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm nonneg_mask = ctx.reg_alloc.ScratchXmm();

        // The saturating add cannot wrap, so its sign matches the exact sum.
        code.pxor(nonneg_mask, nonneg_mask);
        code.movdqa(sat, lhs);
        code.paddsw(sat, rhs);
        // 0 > sat marks the negative lanes ...
        code.pcmpgtw(nonneg_mask, sat);
        // ... then `sat` is recycled as an all-ones register to invert the mask.
        code.pcmpeqw(sat, sat);
        code.pxor(nonneg_mask, sat);

        ctx.reg_alloc.DefineValue(ge_user, nonneg_mask);
        ctx.EraseInstruction(ge_user);
    }

    code.paddw(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}

// Emits a lane-wise 8-bit subtraction (psubb) of operand 1 from operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is produced
// first that is all-ones in every lane where operand 0 >= operand 1 (unsigned).
void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(operands[1]);

    if (ge_user) {
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();

        // a >= b  <=>  max(a, b) == a
        code.movdqa(no_borrow, minuend);
        code.pmaxub(no_borrow, subtrahend);
        code.pcmpeqb(no_borrow, minuend);

        ctx.reg_alloc.DefineValue(ge_user, no_borrow);
        ctx.EraseInstruction(ge_user);
    }

    code.psubb(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}

// Emits a lane-wise 8-bit subtraction (psubb) of operand 1 from operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is produced
// first (while operand 0 is still unmodified) that is all-ones in every lane
// whose signed difference is non-negative.
void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(operands[1]);

    if (ge_user) {
        const Xbyak::Xmm sat_diff = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm nonneg_mask = ctx.reg_alloc.ScratchXmm();

        // The saturating subtract cannot wrap, so its sign matches the exact difference.
        code.pxor(nonneg_mask, nonneg_mask);
        code.movdqa(sat_diff, minuend);
        code.psubsb(sat_diff, subtrahend);
        // 0 > sat_diff marks the negative lanes ...
        code.pcmpgtb(nonneg_mask, sat_diff);
        // ... then `sat_diff` is recycled as an all-ones register to invert the mask.
        code.pcmpeqb(sat_diff, sat_diff);
        code.pxor(nonneg_mask, sat_diff);

        ctx.reg_alloc.DefineValue(ge_user, nonneg_mask);
        ctx.EraseInstruction(ge_user);
    }

    code.psubb(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}

// Emits a lane-wise 16-bit subtraction (psubw) of operand 1 from operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced that is all-ones in every lane where operand 0 >= operand 1
// (unsigned). pmaxuw is used when SSE4.1 is available; otherwise a biased
// signed comparison is emitted.
void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(operands[0]);

    if (!ge_user) {
        // Nobody consumes the GE flags: a bare subtraction is enough.
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(operands[1]);

        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(inst, minuend);
    } else if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(operands[1]);
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();

        // a >= b  <=>  max(a, b) == a   (pmaxuw needs SSE4.1)
        code.movdqa(no_borrow, minuend);
        code.pmaxuw(no_borrow, subtrahend);
        code.pcmpeqw(no_borrow, minuend);

        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(ge_user, no_borrow);
        ctx.EraseInstruction(ge_user);
        ctx.reg_alloc.DefineValue(inst, minuend);
    } else {
        // The subtrahend is modified in place below, hence the scratch request.
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(operands[1]);
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

        // a >= b  <=>  !(b > a).
        // Only a signed compare is available, so both operands are offset by
        // 0x8000 per halfword to map unsigned ordering onto signed ordering.
        code.pcmpeqb(all_ones, all_ones);
        code.paddw(minuend, code.MConst(0x80008000));
        code.paddw(subtrahend, code.MConst(0x80008000));
        code.movdqa(no_borrow, subtrahend);
        code.pcmpgtw(no_borrow, minuend);
        code.pxor(no_borrow, all_ones);

        // The identical offsets cancel in the subtraction.
        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(ge_user, no_borrow);
        ctx.EraseInstruction(ge_user);
        ctx.reg_alloc.DefineValue(inst, minuend);
    }
}

// Emits a lane-wise 16-bit subtraction (psubw) of operand 1 from operand 0.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is
// produced first (while operand 0 is still unmodified) that is all-ones in
// every lane whose signed difference is non-negative.
void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(operands[1]);

    if (ge_user) {
        const Xbyak::Xmm sat_diff = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm nonneg_mask = ctx.reg_alloc.ScratchXmm();

        // The saturating subtract cannot wrap, so its sign matches the exact difference.
        code.pxor(nonneg_mask, nonneg_mask);
        code.movdqa(sat_diff, minuend);
        code.psubsw(sat_diff, subtrahend);
        // 0 > sat_diff marks the negative lanes ...
        code.pcmpgtw(nonneg_mask, sat_diff);
        // ... then `sat_diff` is recycled as an all-ones register to invert the mask.
        code.pcmpeqw(sat_diff, sat_diff);
        code.pxor(nonneg_mask, sat_diff);

        ctx.reg_alloc.DefineValue(ge_user, nonneg_mask);
        ctx.EraseInstruction(ge_user);
    }

    code.psubw(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}

// Emits code computing, per unsigned byte lane, (a + b) >> 1 without losing the
// carry. A general-purpose-register sequence is used unless at least one
// operand already lives in an XMM register, in which case pavgb is used.
void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    if (!operands[0].IsInXmm() && !operands[1].IsInXmm()) {
        // `acc` first holds a, then a&b, and finally the result.
        const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
        const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(operands[1]).cvt32();
        const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();

        // Identity used: x + y == ((x & y) << 1) + (x ^ y),
        // hence (x + y) / 2 == (x & y) + ((x ^ y) >> 1).
        // The whole-register shift drags each byte's LSB into the top bit of the
        // byte below it; the 0x7F mask discards that stray bit.

        code.mov(half_xor, acc);
        code.and_(acc, rhs);
        code.xor_(half_xor, rhs);
        code.shr(half_xor, 1);
        code.and_(half_xor, 0x7F7F7F7F);
        code.add(acc, half_xor);

        ctx.reg_alloc.DefineValue(inst, acc);
        return;
    }

    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(operands[1]);
    const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

    // pavgb rounds up:       pavg(a, b)   == (a + b + 1) >> 1
    // Complementing both inputs and the output rounds down instead:
    //                       ~pavg(~a, ~b) == (a + b) >> 1

    code.pcmpeqb(all_ones, all_ones);
    code.pxor(lhs, all_ones);
    code.pxor(rhs, all_ones);
    code.pavgb(lhs, rhs);
    code.pxor(lhs, all_ones);

    ctx.reg_alloc.DefineValue(inst, lhs);
}

// Emits code computing, per unsigned halfword lane, (a + b) >> 1 without losing
// the carry. A general-purpose-register sequence is used unless at least one
// operand already lives in an XMM register.
void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    // Both paths rely on: x + y == ((x & y) << 1) + (x ^ y),
    // hence (x + y) / 2 == (x & y) + ((x ^ y) >> 1).

    if (!operands[0].IsInXmm() && !operands[1].IsInXmm()) {
        // `acc` first holds a, then a&b, and finally the result.
        const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
        const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(operands[1]).cvt32();
        const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();

        // The whole-register shift drags the upper halfword's LSB into the top
        // bit of the lower halfword; the 0x7FFF mask discards that stray bit.

        code.mov(half_xor, acc);
        code.and_(acc, rhs);
        code.xor_(half_xor, rhs);
        code.shr(half_xor, 1);
        code.and_(half_xor, 0x7FFF7FFF);
        code.add(acc, half_xor);

        ctx.reg_alloc.DefineValue(inst, acc);
        return;
    }

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(operands[1]);
    const Xbyak::Xmm half_xor = ctx.reg_alloc.ScratchXmm();

    // psrlw shifts each halfword independently, so no masking is required.
    code.movdqa(half_xor, acc);
    code.pand(acc, rhs);
    code.pxor(half_xor, rhs);
    code.psrlw(half_xor, 1);
    code.paddw(acc, half_xor);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits a general-purpose-register sequence computing, per signed byte lane,
// (a + b) >> 1 with an arithmetic (sign-preserving) shift.
void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    // `acc` first holds a, then a&b, and finally the result.
    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(operands[1]).cvt32();
    const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();
    const Xbyak::Reg32 sign_bits = ctx.reg_alloc.ScratchGpr().cvt32();

    // Identity used: x + y == ((x & y) << 1) + (x ^ y),
    // hence (x + y) / 2 == (x & y) + ((x ^ y) >> 1).
    // The whole-register shift drags each byte's LSB into the top bit of the
    // byte below it; the 0x7F mask discards that stray bit.
    // `sign_bits` keeps bit 7 of each byte of x^y so that it can be re-applied
    // after the add, emulating an arithmetic shift of x^y.

    code.mov(half_xor, acc);
    code.and_(acc, rhs);
    code.xor_(half_xor, rhs);
    code.mov(sign_bits, half_xor);
    code.and_(sign_bits, 0x80808080);
    code.shr(half_xor, 1);
    code.and_(half_xor, 0x7F7F7F7F);
    code.add(acc, half_xor);
    code.xor_(acc, sign_bits);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits an SSE sequence computing, per signed halfword lane, (a + b) >> 1 with
// an arithmetic (sign-preserving) shift.
void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(operands[1]);
    const Xbyak::Xmm half_xor = ctx.reg_alloc.ScratchXmm();

    // Identity used: x + y == ((x & y) << 1) + (x ^ y),
    // hence (x + y) / 2 == (x & y) + ((x ^ y) >>> 1),
    // where >>> is the arithmetic shift (psraw) that makes the result signed.

    code.movdqa(half_xor, acc);
    code.pand(acc, rhs);
    code.pxor(half_xor, rhs);
    code.psraw(half_xor, 1);
    code.paddw(acc, half_xor);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits a general-purpose-register sequence computing, per unsigned byte lane,
// (a - b) >> 1 truncated to 8 bits.
void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
    const Xbyak::Reg32 borrow_bits = ctx.reg_alloc.UseScratchGpr(operands[1]).cvt32();

    // Identity used: x - y == (x ^ y) - (((x ^ y) & y) << 1),
    // hence (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).

    code.xor_(acc, borrow_bits);
    code.and_(borrow_bits, acc);
    code.shr(acc, 1);

    // Register contents now:
    //   acc         == (a ^ b) >> 1
    //   borrow_bits == (a ^ b) & b

    // What remains is a subtraction that must not borrow across byte lanes.
    // Each lane of `acc` only needs 7 bits, so bit 7 is forced to 1 to serve
    // as a per-lane bit to borrow from. After the subtraction that bit is
    // flipped back: it then records whether the lane actually borrowed.
    code.or_(acc, 0x80808080);
    code.sub(acc, borrow_bits);
    code.xor_(acc, 0x80808080);

    // `acc` holds the final packed result.
    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits a general-purpose-register sequence computing, per signed byte lane,
// (a - b) >> 1 with an arithmetic (sign-preserving) shift.
void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
    const Xbyak::Reg32 borrow_bits = ctx.reg_alloc.UseScratchGpr(operands[1]).cvt32();

    const Xbyak::Reg32 sign_bits = ctx.reg_alloc.ScratchGpr().cvt32();

    // Identity used: x - y == (x ^ y) - (((x ^ y) & y) << 1),
    // hence (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).

    code.xor_(acc, borrow_bits);
    code.and_(borrow_bits, acc);
    code.mov(sign_bits, acc);
    code.and_(sign_bits, 0x80808080);
    code.shr(acc, 1);

    // Register contents now:
    //   acc         == (a ^ b) >> 1
    //   borrow_bits == (a ^ b) & b
    //   sign_bits   == (a ^ b) & 0x80808080

    // What remains is a subtraction that must not borrow across byte lanes.
    // Each lane of `acc` only needs 7 bits, so bit 7 is forced to 1 to serve
    // as a per-lane bit to borrow from. After the subtraction that bit is
    // flipped back so it records whether the lane borrowed, and finally the
    // saved sign of a^b is folded in to sign-extend the halved value.
    code.or_(acc, 0x80808080);
    code.sub(acc, borrow_bits);
    code.xor_(acc, 0x80808080);
    code.xor_(acc, sign_bits);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits an SSE sequence computing, per unsigned halfword lane, (a - b) >> 1
// truncated to 16 bits.
void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm borrow_bits = ctx.reg_alloc.UseScratchXmm(operands[1]);

    // Identity used: x - y == (x ^ y) - (((x ^ y) & y) << 1),
    // hence (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).

    code.pxor(acc, borrow_bits);
    code.pand(borrow_bits, acc);
    code.psrlw(acc, 1);

    // Register contents now:
    //   acc         == (a ^ b) >> 1
    //   borrow_bits == (a ^ b) & b

    code.psubw(acc, borrow_bits);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Emits an SSE sequence computing, per signed halfword lane, (a - b) >> 1 with
// an arithmetic (sign-preserving) shift.
void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm borrow_bits = ctx.reg_alloc.UseScratchXmm(operands[1]);

    // Identity used: x - y == (x ^ y) - (((x ^ y) & y) << 1),
    // hence (x - y) / 2 == ((x ^ y) >>> 1) - ((x ^ y) & y),
    // where >>> is the arithmetic shift (psraw).

    code.pxor(acc, borrow_bits);
    code.pand(borrow_bits, acc);
    code.psraw(acc, 1);

    // Register contents now:
    //   acc         == (a ^ b) >>> 1
    //   borrow_bits == (a ^ b) & b

    code.psubw(acc, borrow_bits);

    ctx.reg_alloc.DefineValue(inst, acc);
}

// Shared emitter for the 16-bit "exchange" add/subtract family.
//
// Each 32-bit operand is split into a low and a high halfword. The result is:
//   hi_is_sum == true :  high = a.hi + b.lo,  low = a.lo - b.hi
//   hi_is_sum == false:  high = a.hi - b.lo,  low = a.lo + b.hi
//
// is_signed  - sign-extend (true) or zero-extend (false) the halfwords.
// is_halving - shift each 32-bit intermediate right by one before packing.
//
// If a GetGEFromOp pseudo-operation is attached to `inst`, a mask with one
// all-ones/all-zeros halfword per result lane is also produced.
void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Reg32 a_hi = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
    const Xbyak::Reg32 b_hi = ctx.reg_alloc.UseScratchGpr(operands[1]).cvt32();
    const Xbyak::Reg32 a_lo = ctx.reg_alloc.ScratchGpr().cvt32();
    const Xbyak::Reg32 b_lo = ctx.reg_alloc.ScratchGpr().cvt32();

    // Widen every halfword to 32 bits so the arithmetic below cannot overflow.
    if (is_signed) {
        code.movsx(a_lo, a_hi.cvt16());
        code.movsx(b_lo, b_hi.cvt16());
        code.sar(a_hi, 16);
        code.sar(b_hi, 16);
    } else {
        code.movzx(a_lo, a_hi.cvt16());
        code.movzx(b_lo, b_hi.cvt16());
        code.shr(a_hi, 16);
        code.shr(b_hi, 16);
    }

    // The a-registers accumulate the results; remember which one is which.
    const Xbyak::Reg32 sum = hi_is_sum ? a_hi : a_lo;
    const Xbyak::Reg32 diff = hi_is_sum ? a_lo : a_hi;

    if (hi_is_sum) {
        code.sub(a_lo, b_hi);
        code.add(a_hi, b_lo);
    } else {
        code.add(a_lo, b_hi);
        code.sub(a_hi, b_lo);
    }

    if (ge_user) {
        // b_hi and b_lo are dead from here on, so they double as GE scratch.
        const Xbyak::Reg32 ge_sum = b_hi;
        const Xbyak::Reg32 ge_diff = b_lo;

        code.mov(ge_sum, sum);
        code.mov(ge_diff, diff);

        if (is_signed) {
            // Signed sum: flag is set when the sum is non-negative.
            code.not_(ge_sum);
        } else {
            // Unsigned sum: flag is the carry out of bit 15, i.e. bit 16.
            code.shl(ge_sum, 15);
        }
        // Broadcast bit 31 across the register.
        code.sar(ge_sum, 31);

        // Difference: flag is set when the difference is non-negative.
        code.not_(ge_diff);
        code.sar(ge_diff, 31);

        // Keep each flag only in the halfword its result lane occupies.
        code.and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
        code.and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
        code.or_(ge_sum, ge_diff);

        ctx.reg_alloc.DefineValue(ge_user, ge_sum);
        ctx.EraseInstruction(ge_user);
    }

    // Move the low result's 16 useful bits to the top of a_lo. When halving,
    // shifting by one less (and shifting a_hi right by one) drops the LSB of
    // each intermediate.
    code.shl(a_lo, is_halving ? 15 : 16);
    if (is_halving) {
        code.shr(a_hi, 1);
    }

    // shld shifts a_hi left by 16 while pulling the top 16 bits of a_lo in
    // underneath, yielding high:low packed in a_hi.
    code.shld(a_hi, a_lo, 16);

    ctx.reg_alloc.DefineValue(inst, a_hi);
}

// Unsigned, non-halving variant with the sum in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=true, is_signed=false,
// is_halving=false (high = a.hi + b.lo, low = a.lo - b.hi).
void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, true, false, false);
}

// Signed, non-halving variant with the sum in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=true, is_signed=true,
// is_halving=false (high = a.hi + b.lo, low = a.lo - b.hi).
void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, true, true, false);
}

// Unsigned, non-halving variant with the difference in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=false, is_signed=false,
// is_halving=false (high = a.hi - b.lo, low = a.lo + b.hi).
void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, false, false, false);
}

// Signed, non-halving variant with the difference in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=false, is_signed=true,
// is_halving=false (high = a.hi - b.lo, low = a.lo + b.hi).
void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, false, true, false);
}

// Unsigned, halving variant with the sum in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=true, is_signed=false,
// is_halving=true (each intermediate is shifted right by one before packing).
void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, true, false, true);
}

// Signed, halving variant with the sum in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=true, is_signed=true,
// is_halving=true (each intermediate is shifted right by one before packing).
void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, true, true, true);
}

// Unsigned, halving variant with the difference in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=false, is_signed=false,
// is_halving=true (each intermediate is shifted right by one before packing).
void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, false, false, true);
}

// Signed, halving variant with the difference in the high halfword:
// forwards to EmitPackedSubAdd with hi_is_sum=false, is_signed=true,
// is_halving=true (each intermediate is shifted right by one before packing).
void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedSubAdd(code, ctx, inst, false, true, true);
}

// Generic emitter for a two-operand packed instruction of the form
// `op dest, src`: operand 0 is loaded into a scratch XMM register that is
// overwritten by `fn` and becomes the value of `inst`; operand 1 is only read.
static void EmitPackedOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm dest = ctx.reg_alloc.UseScratchXmm(operands[0]);
    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(operands[1]);

    // Invoke the supplied Xbyak mnemonic on the code generator.
    (code.*fn)(dest, src);

    ctx.reg_alloc.DefineValue(inst, dest);
}

// Emits a single paddusb (packed byte add with unsigned saturation) on the two
// operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusb);
}

// Emits a single paddsb (packed byte add with signed saturation) on the two
// operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsb);
}

// Emits a single psubusb (packed byte subtract with unsigned saturation) on
// the two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusb);
}

// Emits a single psubsb (packed byte subtract with signed saturation) on the
// two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsb);
}

// Emits a single paddusw (packed halfword add with unsigned saturation) on the
// two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusw);
}

// Emits a single paddsw (packed halfword add with signed saturation) on the
// two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsw);
}

// Emits a single psubusw (packed halfword subtract with unsigned saturation)
// on the two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusw);
}

// Emits a single psubsw (packed halfword subtract with signed saturation) on
// the two operands via EmitPackedOperation.
void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) {
    EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsw);
}

// Emits the sum of absolute differences of the four bytes packed in the low
// 32 bits of each operand (psadbw).
//
// psadbw accumulates over all eight bytes of each 64-bit half, while only
// bytes 0..3 belong to the operands. Nothing guarantees that bytes 4..7 of the
// XMM registers are zero -- other emitters in this file (for instance the GE
// masks built with pcmpeqb) can leave non-zero data above bit 31 -- so both
// inputs are masked down to their low 32 bits before the sum is taken.
void EmitX64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // Both operands are modified by the masking, so both must be scratch.
    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
    Xbyak::Xmm low_dword_mask = ctx.reg_alloc.ScratchXmm();

    // All-ones shifted right by 32 within each qword: 0x00000000FFFFFFFF.
    code.pcmpeqb(low_dword_mask, low_dword_mask);
    code.psrlq(low_dword_mask, 32);

    code.pand(xmm_a, low_dword_mask);
    code.pand(xmm_b, low_dword_mask);
    code.psadbw(xmm_a, xmm_b);

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}

// Emits a bitwise select: result = (operand2 & mask) | (operand1 & ~mask),
// where the mask is operand 0.
// Three code paths exist: SSE when at least two operands already live in XMM
// registers, a BMI1 `andn` sequence on general-purpose registers, and a plain
// and/not/or fallback.
void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);

    const size_t xmm_resident = operands[0].IsInXmm() + operands[1].IsInXmm() + operands[2].IsInXmm();

    if (xmm_resident >= 2) {
        const Xbyak::Xmm mask = ctx.reg_alloc.UseScratchXmm(operands[0]);
        const Xbyak::Xmm when_clear = ctx.reg_alloc.UseXmm(operands[1]);
        const Xbyak::Xmm when_set = ctx.reg_alloc.UseScratchXmm(operands[2]);

        code.pand(when_set, mask);
        // pandn computes ~dest & src, i.e. mask becomes ~mask & when_clear.
        code.pandn(mask, when_clear);
        code.por(when_set, mask);

        ctx.reg_alloc.DefineValue(inst, when_set);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) {
        const Xbyak::Reg32 mask = ctx.reg_alloc.UseGpr(operands[0]).cvt32();
        const Xbyak::Reg32 when_clear = ctx.reg_alloc.UseScratchGpr(operands[1]).cvt32();
        const Xbyak::Reg32 when_set = ctx.reg_alloc.UseScratchGpr(operands[2]).cvt32();

        code.and_(when_set, mask);
        // andn leaves the mask untouched: when_clear = ~mask & when_clear.
        code.andn(when_clear, mask, when_clear);
        code.or_(when_set, when_clear);

        ctx.reg_alloc.DefineValue(inst, when_set);
        return;
    }

    // No BMI1: the mask register itself is inverted, so it has to be scratch.
    const Xbyak::Reg32 mask = ctx.reg_alloc.UseScratchGpr(operands[0]).cvt32();
    const Xbyak::Reg32 when_clear = ctx.reg_alloc.UseGpr(operands[1]).cvt32();
    const Xbyak::Reg32 when_set = ctx.reg_alloc.UseScratchGpr(operands[2]).cvt32();

    code.and_(when_set, mask);
    code.not_(mask);
    code.and_(mask, when_clear);
    code.or_(when_set, mask);

    ctx.reg_alloc.DefineValue(inst, when_set);
}

} // namespace Dynarmic::BackendX64