/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */

#include "backend_x64/block_of_code.h"
#include "backend_x64/emit_x64.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "frontend/ir/basic_block.h"
#include "frontend/ir/microinstruction.h"
#include "frontend/ir/opcodes.h"

namespace Dynarmic::BackendX64 {

using namespace Xbyak::util;

// Emits a lane-wise wrapping add of packed bytes (paddb) of args[0] and args[1].
// If a GetGEFromOp pseudo-operation is attached to inst, it is additionally defined as
// a per-byte mask that is all-ones where the unsigned addition wrapped around,
// and the pseudo-operation is then erased.
void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);

    code->paddb(xmm_a, xmm_b);

    if (ge_inst) {
        Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
        Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();

        code->pcmpeqb(ones, ones);

        // min(a+b, b) == b  <=>  b <= a+b (unsigned), i.e. no wrap-around.
        // Inverting the comparison result marks the bytes where b > a+b.
        code->movdqa(xmm_ge, xmm_a);
        code->pminub(xmm_ge, xmm_b);
        code->pcmpeqb(xmm_ge, xmm_b);
        code->pxor(xmm_ge, ones);

        ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
        ctx.EraseInstruction(ge_inst);
    }

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}

// Emits a lane-wise wrapping add of packed bytes (paddb) of args[0] and args[1].
// If a GetGEFromOp pseudo-operation is attached to inst, it is additionally defined as
// a per-byte mask that is all-ones where the signed-saturated sum is non-negative,
// and the pseudo-operation is then erased.
void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
        Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();

        // xmm_ge := (0 > saturated_sum), then inverted to give (saturated_sum >= 0).
        // saturated_sum is reused as the all-ones constant for the inversion.
        code->pxor(xmm_ge, xmm_ge);
        code->movdqa(saturated_sum, xmm_a);
        code->paddsb(saturated_sum, xmm_b);
        code->pcmpgtb(xmm_ge, saturated_sum);
        code->pcmpeqb(saturated_sum, saturated_sum);
        code->pxor(xmm_ge, saturated_sum);

        ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
        ctx.EraseInstruction(ge_inst);
    }

    code->paddb(xmm_a, xmm_b);

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}
void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); code->paddw(xmm_a, xmm_b); if (ge_inst) { if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); code->pcmpeqb(ones, ones); code->movdqa(xmm_ge, xmm_a); code->pminuw(xmm_ge, xmm_b); code->pcmpeqw(xmm_ge, xmm_b); code->pxor(xmm_ge, ones); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); } else { Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); // !(b <= a+b) == b > a+b code->movdqa(tmp_a, xmm_a); code->movdqa(tmp_b, xmm_b); code->paddw(tmp_a, code->MConst(0x80008000)); code->paddw(tmp_b, code->MConst(0x80008000)); code->pcmpgtw(tmp_b, tmp_a); // *Signed* comparison! 
ctx.reg_alloc.DefineValue(ge_inst, tmp_b); ctx.EraseInstruction(ge_inst); } } ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); if (ge_inst) { Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); code->pxor(xmm_ge, xmm_ge); code->movdqa(saturated_sum, xmm_a); code->paddsw(saturated_sum, xmm_b); code->pcmpgtw(xmm_ge, saturated_sum); code->pcmpeqw(saturated_sum, saturated_sum); code->pxor(xmm_ge, saturated_sum); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); } code->paddw(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); if (ge_inst) { Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); code->movdqa(xmm_ge, xmm_a); code->pmaxub(xmm_ge, xmm_b); code->pcmpeqb(xmm_ge, xmm_a); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); } code->psubb(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); if (ge_inst) { Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); code->pxor(xmm_ge, xmm_ge); code->movdqa(saturated_sum, xmm_a); 
code->psubsb(saturated_sum, xmm_b); code->pcmpgtb(xmm_ge, saturated_sum); code->pcmpeqb(saturated_sum, saturated_sum); code->pxor(xmm_ge, saturated_sum); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); } code->psubb(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); if (!ge_inst) { Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); code->psubw(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); return; } if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); code->movdqa(xmm_ge, xmm_a); code->pmaxuw(xmm_ge, xmm_b); // Requires SSE 4.1 code->pcmpeqw(xmm_ge, xmm_a); code->psubw(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); ctx.reg_alloc.DefineValue(inst, xmm_a); return; } Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); // (a >= b) == !(b > a) code->pcmpeqb(ones, ones); code->paddw(xmm_a, code->MConst(0x80008000)); code->paddw(xmm_b, code->MConst(0x80008000)); code->movdqa(xmm_ge, xmm_b); code->pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison! 
code->pxor(xmm_ge, ones); code->psubw(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); if (ge_inst) { Xbyak::Xmm saturated_diff = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); code->pxor(xmm_ge, xmm_ge); code->movdqa(saturated_diff, xmm_a); code->psubsw(saturated_diff, xmm_b); code->pcmpgtw(xmm_ge, saturated_diff); code->pcmpeqw(saturated_diff, saturated_diff); code->pxor(xmm_ge, saturated_diff); ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); ctx.EraseInstruction(ge_inst); } code->psubw(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (args[0].IsInXmm() || args[1].IsInXmm()) { Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); // Since, // pavg(a, b) == (a + b + 1) >> 1 // Therefore, // ~pavg(~a, ~b) == (a + b) >> 1 code->pcmpeqb(ones, ones); code->pxor(xmm_a, ones); code->pxor(xmm_b, ones); code->pavgb(xmm_a, xmm_b); code->pxor(xmm_a, ones); ctx.reg_alloc.DefineValue(inst, xmm_a); } else { Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; // This relies on the equality x+y == ((x&y) << 1) + (x^y). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). 
// We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. code->mov(xor_a_b, reg_a); code->and_(and_a_b, reg_b); code->xor_(xor_a_b, reg_b); code->shr(xor_a_b, 1); code->and_(xor_a_b, 0x7F7F7F7F); code->add(result, xor_a_b); ctx.reg_alloc.DefineValue(inst, result); } } void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (args[0].IsInXmm() || args[1].IsInXmm()) { Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); code->movdqa(tmp, xmm_a); code->pand(xmm_a, xmm_b); code->pxor(tmp, xmm_b); code->psrlw(tmp, 1); code->paddw(xmm_a, tmp); ctx.reg_alloc.DefineValue(inst, xmm_a); } else { Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; // This relies on the equality x+y == ((x&y) << 1) + (x^y). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). // We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below. 
code->mov(xor_a_b, reg_a); code->and_(and_a_b, reg_b); code->xor_(xor_a_b, reg_b); code->shr(xor_a_b, 1); code->and_(xor_a_b, 0x7FFF7FFF); code->add(result, xor_a_b); ctx.reg_alloc.DefineValue(inst, result); } } void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32(); // This relies on the equality x+y == ((x&y) << 1) + (x^y). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. // carry propagates the sign bit from (x^y)>>1 upwards by one. code->mov(xor_a_b, reg_a); code->and_(and_a_b, reg_b); code->xor_(xor_a_b, reg_b); code->mov(carry, xor_a_b); code->and_(carry, 0x80808080); code->shr(xor_a_b, 1); code->and_(xor_a_b, 0x7F7F7F7F); code->add(result, xor_a_b); code->xor_(result, carry); ctx.reg_alloc.DefineValue(inst, result); } void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); // This relies on the equality x+y == ((x&y) << 1) + (x^y). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>>1). // The arithmetic shift right makes this signed. 
code->movdqa(tmp, xmm_a); code->pand(xmm_a, xmm_b); code->pxor(tmp, xmm_b); code->psraw(tmp, 1); code->paddw(xmm_a, tmp); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x+y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). code->xor_(minuend, subtrahend); code->and_(subtrahend, minuend); code->shr(minuend, 1); // At this point, // minuend := (a^b) >> 1 // subtrahend := (a^b) & b // We must now perform a partitioned subtraction. // We can do this because minuend contains 7 bit fields. // We use the extra bit in minuend as a bit to borrow from; we set this bit. // We invert this bit at the end as this tells us if that bit was borrowed from. code->or_(minuend, 0x80808080); code->sub(minuend, subtrahend); code->xor_(minuend, 0x80808080); // minuend now contains the desired result. ctx.reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32(); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). 
code->xor_(minuend, subtrahend); code->and_(subtrahend, minuend); code->mov(carry, minuend); code->and_(carry, 0x80808080); code->shr(minuend, 1); // At this point, // minuend := (a^b) >> 1 // subtrahend := (a^b) & b // carry := (a^b) & 0x80808080 // We must now perform a partitioned subtraction. // We can do this because minuend contains 7 bit fields. // We use the extra bit in minuend as a bit to borrow from; we set this bit. // We invert this bit at the end as this tells us if that bit was borrowed from. // We then sign extend the result into this bit. code->or_(minuend, 0x80808080); code->sub(minuend, subtrahend); code->xor_(minuend, 0x80808080); code->xor_(minuend, carry); ctx.reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). code->pxor(minuend, subtrahend); code->pand(subtrahend, minuend); code->psrlw(minuend, 1); // At this point, // minuend := (a^b) >> 1 // subtrahend := (a^b) & b code->psubw(minuend, subtrahend); ctx.reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>>1) - ((x^y)&y). 
code->pxor(minuend, subtrahend); code->pand(subtrahend, minuend); code->psraw(minuend, 1); // At this point, // minuend := (a^b) >>> 1 // subtrahend := (a^b) & b code->psubw(minuend, subtrahend); ctx.reg_alloc.DefineValue(inst, minuend); } void EmitPackedSubAdd(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 reg_sum, reg_diff; if (is_signed) { code->movsx(reg_a_lo, reg_a_hi.cvt16()); code->movsx(reg_b_lo, reg_b_hi.cvt16()); code->sar(reg_a_hi, 16); code->sar(reg_b_hi, 16); } else { code->movzx(reg_a_lo, reg_a_hi.cvt16()); code->movzx(reg_b_lo, reg_b_hi.cvt16()); code->shr(reg_a_hi, 16); code->shr(reg_b_hi, 16); } if (hi_is_sum) { code->sub(reg_a_lo, reg_b_hi); code->add(reg_a_hi, reg_b_lo); reg_diff = reg_a_lo; reg_sum = reg_a_hi; } else { code->add(reg_a_lo, reg_b_hi); code->sub(reg_a_hi, reg_b_lo); reg_diff = reg_a_hi; reg_sum = reg_a_lo; } if (ge_inst) { // The reg_b registers are no longer required. Xbyak::Reg32 ge_sum = reg_b_hi; Xbyak::Reg32 ge_diff = reg_b_lo; code->mov(ge_sum, reg_sum); code->mov(ge_diff, reg_diff); if (!is_signed) { code->shl(ge_sum, 15); code->sar(ge_sum, 31); } else { code->not_(ge_sum); code->sar(ge_sum, 31); } code->not_(ge_diff); code->sar(ge_diff, 31); code->and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF); code->and_(ge_diff, hi_is_sum ? 
0x0000FFFF : 0xFFFF0000); code->or_(ge_sum, ge_diff); ctx.reg_alloc.DefineValue(ge_inst, ge_sum); ctx.EraseInstruction(ge_inst); } if (is_halving) { code->shl(reg_a_lo, 15); code->shr(reg_a_hi, 1); } else { code->shl(reg_a_lo, 16); } // reg_a_lo now contains the low word and reg_a_hi now contains the high word. // Merge them. code->shld(reg_a_hi, reg_a_lo, 16); ctx.reg_alloc.DefineValue(inst, reg_a_hi); } void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, true, false, false); } void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, true, true, false); } void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, false, false, false); } void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, false, true, false); } void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, true, false, true); } void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, true, true, true); } void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, false, false, true); } void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedSubAdd(code, ctx, inst, false, true, true); } static void EmitPackedOperation(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); (code->*fn)(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusb); } void 
EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsb); } void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusb); } void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsb); } void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusw); } void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsw); } void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusw); } void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsw); } void EmitX64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) { EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psadbw); } void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); size_t num_args_in_xmm = args[0].IsInXmm() + args[1].IsInXmm() + args[2].IsInXmm(); if (num_args_in_xmm >= 2) { Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(args[0]); Xbyak::Xmm to = ctx.reg_alloc.UseXmm(args[1]); Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(args[2]); code->pand(from, ge); code->pandn(ge, to); code->por(from, ge); ctx.reg_alloc.DefineValue(inst, from); } else if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) { Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32(); Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); code->and_(from, ge); code->andn(to, ge, to); code->or_(from, to); 
ctx.reg_alloc.DefineValue(inst, from); } else { Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); code->and_(from, ge); code->not_(ge); code->and_(ge, to); code->or_(from, ge); ctx.reg_alloc.DefineValue(inst, from); } } } // namespace Dynarmic::BackendX64