704 lines
24 KiB
C++
704 lines
24 KiB
C++
/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */
|
|
|
|
#include "backend_x64/block_of_code.h"
|
|
#include "backend_x64/emit_x64.h"
|
|
#include "common/assert.h"
|
|
#include "common/common_types.h"
|
|
#include "frontend/ir/basic_block.h"
|
|
#include "frontend/ir/microinstruction.h"
|
|
#include "frontend/ir/opcodes.h"
|
|
|
|
namespace Dynarmic::BackendX64 {
|
|
|
|
using namespace Xbyak::util;
|
|
|
|
// Emits a lane-wise 8-bit addition (paddb) of the two arguments.
// If a GetGEFromOp pseudo-operation is attached to `inst`, a per-byte mask is also
// produced: a byte is all-ones when the wrapped sum is below the second operand
// (unsigned), i.e. when the addition carried out of that byte.
void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm sum = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);

    code.paddb(sum, addend);

    if (!ge_inst) {
        ctx.reg_alloc.DefineValue(inst, sum);
        return;
    }

    const Xbyak::Xmm carry_mask = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

    // Comparing a register with itself sets every bit.
    code.pcmpeqb(all_ones, all_ones);

    // min(sum, addend) == addend  <=>  addend <= sum  <=>  no carry.
    code.movdqa(carry_mask, sum);
    code.pminub(carry_mask, addend);
    code.pcmpeqb(carry_mask, addend);
    // Invert so that the mask marks the bytes that did carry.
    code.pxor(carry_mask, all_ones);

    ctx.reg_alloc.DefineValue(ge_inst, carry_mask);
    ctx.EraseInstruction(ge_inst);

    ctx.reg_alloc.DefineValue(inst, sum);
}
|
|
|
|
// Emits a lane-wise 8-bit addition (paddb) of the two arguments.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is also produced:
// a byte is all-ones when the signed-saturated sum of that lane is not negative.
void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        const Xbyak::Xmm sat_sum = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm non_negative = ctx.reg_alloc.ScratchXmm();

        // Start from zero so the compare below evaluates (0 > saturated sum).
        code.pxor(non_negative, non_negative);
        code.movdqa(sat_sum, lhs);
        code.paddsb(sat_sum, rhs);
        code.pcmpgtb(non_negative, sat_sum);
        // sat_sum is no longer needed; reuse it as an all-ones constant to invert the mask.
        code.pcmpeqb(sat_sum, sat_sum);
        code.pxor(non_negative, sat_sum);

        ctx.reg_alloc.DefineValue(ge_inst, non_negative);
        ctx.EraseInstruction(ge_inst);
    }

    // The result itself uses the wrapping addition.
    code.paddb(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Emits a lane-wise 16-bit addition (paddw) of the two arguments.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced: a halfword is all-ones when the wrapped sum is below the second
// operand (unsigned), i.e. when the addition carried out of that halfword.
// pminuw is used when SSE4.1 is available; otherwise an unsigned comparison is
// synthesised from the signed pcmpgtw.
void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm sum = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);

    code.paddw(sum, addend);

    if (!ge_inst) {
        ctx.reg_alloc.DefineValue(inst, sum);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        const Xbyak::Xmm carry_mask = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

        code.pcmpeqb(all_ones, all_ones);

        // min(sum, addend) == addend  <=>  addend <= sum  <=>  no carry.
        code.movdqa(carry_mask, sum);
        code.pminuw(carry_mask, addend);
        code.pcmpeqw(carry_mask, addend);
        // Invert to mark the halfwords that did carry.
        code.pxor(carry_mask, all_ones);

        ctx.reg_alloc.DefineValue(ge_inst, carry_mask);
        ctx.EraseInstruction(ge_inst);
    } else {
        const Xbyak::Xmm biased_sum = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm biased_addend = ctx.reg_alloc.ScratchXmm();

        // carry  <=>  !(addend <= sum)  <=>  addend > sum   (unsigned)
        // Adding 0x8000 to each halfword flips its top bit, so the signed
        // pcmpgtw below orders the values as if they were unsigned.
        code.movdqa(biased_sum, sum);
        code.movdqa(biased_addend, addend);
        code.paddw(biased_sum, code.MConst(0x80008000));
        code.paddw(biased_addend, code.MConst(0x80008000));
        code.pcmpgtw(biased_addend, biased_sum);

        ctx.reg_alloc.DefineValue(ge_inst, biased_addend);
        ctx.EraseInstruction(ge_inst);
    }

    ctx.reg_alloc.DefineValue(inst, sum);
}
|
|
|
|
// Emits a lane-wise 16-bit addition (paddw) of the two arguments.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced: a halfword is all-ones when the signed-saturated sum of that lane is
// not negative.
void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        const Xbyak::Xmm sat_sum = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm non_negative = ctx.reg_alloc.ScratchXmm();

        // Start from zero so the compare below evaluates (0 > saturated sum).
        code.pxor(non_negative, non_negative);
        code.movdqa(sat_sum, lhs);
        code.paddsw(sat_sum, rhs);
        code.pcmpgtw(non_negative, sat_sum);
        // Reuse sat_sum as an all-ones constant to invert the "negative" mask.
        code.pcmpeqw(sat_sum, sat_sum);
        code.pxor(non_negative, sat_sum);

        ctx.reg_alloc.DefineValue(ge_inst, non_negative);
        ctx.EraseInstruction(ge_inst);
    }

    // The result itself uses the wrapping addition.
    code.paddw(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Emits a lane-wise 8-bit subtraction (psubb): first argument minus second.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is also produced:
// a byte is all-ones when the first operand is >= the second (unsigned).
void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();

        // max(a, b) == a  <=>  a >= b. Must be computed before minuend is overwritten.
        code.movdqa(no_borrow, minuend);
        code.pmaxub(no_borrow, subtrahend);
        code.pcmpeqb(no_borrow, minuend);

        ctx.reg_alloc.DefineValue(ge_inst, no_borrow);
        ctx.EraseInstruction(ge_inst);
    }

    code.psubb(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}
|
|
|
|
// Emits a lane-wise 8-bit subtraction (psubb): first argument minus second.
// If a GetGEFromOp pseudo-operation is attached, a per-byte mask is also produced:
// a byte is all-ones when the signed-saturated difference of that lane is not
// negative.
void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        const Xbyak::Xmm sat_diff = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm non_negative = ctx.reg_alloc.ScratchXmm();

        // Start from zero so the compare below evaluates (0 > saturated difference).
        code.pxor(non_negative, non_negative);
        code.movdqa(sat_diff, minuend);
        code.psubsb(sat_diff, subtrahend);
        code.pcmpgtb(non_negative, sat_diff);
        // Reuse sat_diff as an all-ones constant to invert the "negative" mask.
        code.pcmpeqb(sat_diff, sat_diff);
        code.pxor(non_negative, sat_diff);

        ctx.reg_alloc.DefineValue(ge_inst, non_negative);
        ctx.EraseInstruction(ge_inst);
    }

    // The result itself uses the wrapping subtraction.
    code.psubb(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}
|
|
|
|
// Emits a lane-wise 16-bit subtraction (psubw): first argument minus second.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced: a halfword is all-ones when the first operand is >= the second
// (unsigned). pmaxuw is used when SSE4.1 is available; otherwise the unsigned
// comparison is synthesised from the signed pcmpgtw.
void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    // Fast path: nobody consumes the GE mask, so only the difference is needed.
    if (!ge_inst) {
        const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);

        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(inst, minuend);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();

        // max(a, b) == a  <=>  a >= b. pmaxuw is an SSE4.1 instruction.
        code.movdqa(no_borrow, minuend);
        code.pmaxuw(no_borrow, subtrahend);
        code.pcmpeqw(no_borrow, minuend);

        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(ge_inst, no_borrow);
        ctx.EraseInstruction(ge_inst);
        ctx.reg_alloc.DefineValue(inst, minuend);
    } else {
        const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
        const Xbyak::Xmm no_borrow = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

        // (a >= b)  <=>  !(b > a)
        // Both operands are biased by 0x8000 per halfword so that the signed
        // pcmpgtw orders them as unsigned values. The bias is applied to both
        // sides, so it cancels in the psubw at the end.
        code.pcmpeqb(all_ones, all_ones);
        code.paddw(minuend, code.MConst(0x80008000));
        code.paddw(subtrahend, code.MConst(0x80008000));
        code.movdqa(no_borrow, subtrahend);
        code.pcmpgtw(no_borrow, minuend);
        code.pxor(no_borrow, all_ones);

        code.psubw(minuend, subtrahend);

        ctx.reg_alloc.DefineValue(ge_inst, no_borrow);
        ctx.EraseInstruction(ge_inst);
        ctx.reg_alloc.DefineValue(inst, minuend);
    }
}
|
|
|
|
// Emits a lane-wise 16-bit subtraction (psubw): first argument minus second.
// If a GetGEFromOp pseudo-operation is attached, a per-halfword mask is also
// produced: a halfword is all-ones when the signed-saturated difference of that
// lane is not negative.
void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);

    if (ge_inst) {
        const Xbyak::Xmm sat_diff = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm non_negative = ctx.reg_alloc.ScratchXmm();

        // Start from zero so the compare below evaluates (0 > saturated difference).
        code.pxor(non_negative, non_negative);
        code.movdqa(sat_diff, minuend);
        code.psubsw(sat_diff, subtrahend);
        code.pcmpgtw(non_negative, sat_diff);
        // Reuse sat_diff as an all-ones constant to invert the "negative" mask.
        code.pcmpeqw(sat_diff, sat_diff);
        code.pxor(non_negative, sat_diff);

        ctx.reg_alloc.DefineValue(ge_inst, non_negative);
        ctx.EraseInstruction(ge_inst);
    }

    // The result itself uses the wrapping subtraction.
    code.psubw(minuend, subtrahend);

    ctx.reg_alloc.DefineValue(inst, minuend);
}
|
|
|
|
// Emits an unsigned halving addition on packed bytes: each byte of the result is
// (a + b) >> 1 computed without losing the carry.
// An XMM sequence is used when either argument already lives in an XMM register;
// otherwise the computation is done on 32-bit general-purpose registers.
void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (args[0].IsInXmm() || args[1].IsInXmm()) {
        const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
        const Xbyak::Xmm all_ones = ctx.reg_alloc.ScratchXmm();

        // pavgb rounds up:   pavg(a, b)    == (a + b + 1) >> 1
        // Complementing the inputs and the output turns that into rounding down:
        //                   ~pavg(~a, ~b) == (a + b) >> 1
        code.pcmpeqb(all_ones, all_ones);
        code.pxor(lhs, all_ones);
        code.pxor(rhs, all_ones);
        code.pavgb(lhs, rhs);
        code.pxor(lhs, all_ones);

        ctx.reg_alloc.DefineValue(inst, lhs);
        return;
    }

    // acc starts as a, becomes a & b, and finally holds the result.
    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(args[1]).cvt32();
    const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();

    // From x + y == ((x & y) << 1) + (x ^ y) it follows that
    // (x + y) / 2 == (x & y) + ((x ^ y) >> 1).
    // The shift operates on the whole 32-bit register, so each byte's lowest bit
    // lands in the top bit of the byte below; masking with 0x7F per byte discards it.
    code.mov(half_xor, acc);
    code.and_(acc, rhs);
    code.xor_(half_xor, rhs);
    code.shr(half_xor, 1);
    code.and_(half_xor, 0x7F7F7F7F);
    code.add(acc, half_xor);

    ctx.reg_alloc.DefineValue(inst, acc);
}
|
|
|
|
// Emits an unsigned halving addition on packed halfwords: each halfword of the
// result is (a + b) >> 1 computed without losing the carry.
// An XMM sequence is used when either argument already lives in an XMM register;
// otherwise the computation is done on 32-bit general-purpose registers.
// Both paths use the identity (x + y) / 2 == (x & y) + ((x ^ y) >> 1).
void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (args[0].IsInXmm() || args[1].IsInXmm()) {
        const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
        const Xbyak::Xmm half_xor = ctx.reg_alloc.ScratchXmm();

        // psrlw shifts each halfword independently, so no masking is required here.
        code.movdqa(half_xor, acc);
        code.pand(acc, rhs);
        code.pxor(half_xor, rhs);
        code.psrlw(half_xor, 1);
        code.paddw(acc, half_xor);

        ctx.reg_alloc.DefineValue(inst, acc);
        return;
    }

    // acc starts as a, becomes a & b, and finally holds the result.
    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(args[1]).cvt32();
    const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();

    // The shift operates on the whole 32-bit register, so the high halfword's
    // lowest bit lands in the top bit of the low halfword; masking with 0x7FFF
    // per halfword discards it.
    code.mov(half_xor, acc);
    code.and_(acc, rhs);
    code.xor_(half_xor, rhs);
    code.shr(half_xor, 1);
    code.and_(half_xor, 0x7FFF7FFF);
    code.add(acc, half_xor);

    ctx.reg_alloc.DefineValue(inst, acc);
}
|
|
|
|
// Emits a signed halving addition on packed bytes using 32-bit general-purpose
// registers.
void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // acc starts as a, becomes a & b, and finally holds the result.
    const Xbyak::Reg32 acc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseGpr(args[1]).cvt32();
    const Xbyak::Reg32 half_xor = ctx.reg_alloc.ScratchGpr().cvt32();
    const Xbyak::Reg32 sign_fix = ctx.reg_alloc.ScratchGpr().cvt32();

    // From x + y == ((x & y) << 1) + (x ^ y) it follows that
    // (x + y) / 2 == (x & y) + ((x ^ y) >> 1).
    // The shift operates on the whole 32-bit register, so each byte's lowest bit
    // lands in the top bit of the byte below; masking with 0x7F per byte discards it.
    // sign_fix keeps the top bit of every byte of x ^ y (captured before the shift)
    // and is XORed into the sum at the end, which makes the halving signed.
    code.mov(half_xor, acc);
    code.and_(acc, rhs);
    code.xor_(half_xor, rhs);
    code.mov(sign_fix, half_xor);
    code.and_(sign_fix, 0x80808080);
    code.shr(half_xor, 1);
    code.and_(half_xor, 0x7F7F7F7F);
    code.add(acc, half_xor);
    code.xor_(acc, sign_fix);

    ctx.reg_alloc.DefineValue(inst, acc);
}
|
|
|
|
// Emits a signed halving addition on packed halfwords using XMM registers.
void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // acc starts as a, becomes a & b, and finally holds the result.
    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm half_xor = ctx.reg_alloc.ScratchXmm();

    // From x + y == ((x & y) << 1) + (x ^ y) it follows that
    // (x + y) / 2 == (x & y) + ((x ^ y) >>> 1),
    // where >>> is an arithmetic shift (psraw); that is what makes the result signed.
    code.movdqa(half_xor, acc);
    code.pand(acc, rhs);
    code.pxor(half_xor, rhs);
    code.psraw(half_xor, 1);
    code.paddw(acc, half_xor);

    ctx.reg_alloc.DefineValue(inst, acc);
}
|
|
|
|
// Emits an unsigned halving subtraction on packed bytes (first argument minus
// second, halved) using 32-bit general-purpose registers.
void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // lhs starts as a and ends up holding the result; rhs starts as b.
    const Xbyak::Reg32 lhs = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();

    // From x - y == (x ^ y) - (((x ^ y) & y) << 1) it follows that
    // (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).
    code.xor_(lhs, rhs);
    code.and_(rhs, lhs);
    code.shr(lhs, 1);

    // Now:
    //   lhs == (a ^ b) >> 1
    //   rhs == (a ^ b) & b
    //
    // What remains is a subtraction that must not borrow across byte boundaries.
    // After the shift only seven bits per byte of lhs are meaningful, so the top
    // bit of each byte is set and serves as the bit to borrow from. Flipping that
    // bit afterwards records whether the borrow happened.
    code.or_(lhs, 0x80808080);
    code.sub(lhs, rhs);
    code.xor_(lhs, 0x80808080);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Emits a signed halving subtraction on packed bytes (first argument minus
// second, halved) using 32-bit general-purpose registers.
void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // lhs starts as a and ends up holding the result; rhs starts as b.
    const Xbyak::Reg32 lhs = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 rhs = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();

    const Xbyak::Reg32 sign_fix = ctx.reg_alloc.ScratchGpr().cvt32();

    // From x - y == (x ^ y) - (((x ^ y) & y) << 1) it follows that
    // (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).
    code.xor_(lhs, rhs);
    code.and_(rhs, lhs);
    code.mov(sign_fix, lhs);
    code.and_(sign_fix, 0x80808080);
    code.shr(lhs, 1);

    // Now:
    //   lhs      == (a ^ b) >> 1
    //   rhs      == (a ^ b) & b
    //   sign_fix == (a ^ b) & 0x80808080
    //
    // What remains is a subtraction that must not borrow across byte boundaries.
    // After the shift only seven bits per byte of lhs are meaningful, so the top
    // bit of each byte is set and serves as the bit to borrow from. Flipping that
    // bit afterwards records whether the borrow happened, and XORing in sign_fix
    // then sign-extends the result into that bit.
    code.or_(lhs, 0x80808080);
    code.sub(lhs, rhs);
    code.xor_(lhs, 0x80808080);
    code.xor_(lhs, sign_fix);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Emits an unsigned halving subtraction on packed halfwords (first argument minus
// second, halved) using XMM registers.
void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // lhs starts as a and ends up holding the result; rhs starts as b.
    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);

    // From x - y == (x ^ y) - (((x ^ y) & y) << 1) it follows that
    // (x - y) / 2 == ((x ^ y) >> 1) - ((x ^ y) & y).
    code.pxor(lhs, rhs);
    code.pand(rhs, lhs);
    code.psrlw(lhs, 1);

    // Now:
    //   lhs == (a ^ b) >> 1   (logical shift)
    //   rhs == (a ^ b) & b
    code.psubw(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Emits a signed halving subtraction on packed halfwords (first argument minus
// second, halved) using XMM registers.
void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // lhs starts as a and ends up holding the result; rhs starts as b.
    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);

    // From x - y == (x ^ y) - (((x ^ y) & y) << 1) it follows that
    // (x - y) / 2 == ((x ^ y) >>> 1) - ((x ^ y) & y),
    // where >>> is an arithmetic shift (psraw).
    code.pxor(lhs, rhs);
    code.pand(rhs, lhs);
    code.psraw(lhs, 1);

    // Now:
    //   lhs == (a ^ b) >>> 1   (arithmetic shift)
    //   rhs == (a ^ b) & b
    code.psubw(lhs, rhs);

    ctx.reg_alloc.DefineValue(inst, lhs);
}
|
|
|
|
// Shared emitter for the packed 16-bit "exchange" add/subtract operations.
//
// Each 32-bit argument is split into a low and a high halfword, then:
//   hi_is_sum == true :  high result = a.hi + b.lo,  low result = a.lo - b.hi
//   hi_is_sum == false:  high result = a.hi - b.lo,  low result = a.lo + b.hi
//
// is_signed  - halfwords are sign-extended (movsx/sar) instead of zero-extended
//              (movzx/shr) before the arithmetic.
// is_halving - both results are shifted right by one bit before being packed.
//
// If a GetGEFromOp pseudo-operation is attached to `inst`, a mask is produced whose
// sum halfword is all-ones when the sum carried (unsigned) or is non-negative
// (signed), and whose difference halfword is all-ones when the difference is
// non-negative.
void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    // The *_hi registers initially hold the complete 32-bit arguments.
    const Xbyak::Reg32 a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
    const Xbyak::Reg32 a_lo = ctx.reg_alloc.ScratchGpr().cvt32();
    const Xbyak::Reg32 b_lo = ctx.reg_alloc.ScratchGpr().cvt32();

    // Split each argument into 32-bit extended low and high halfwords.
    if (is_signed) {
        code.movsx(a_lo, a_hi.cvt16());
        code.movsx(b_lo, b_hi.cvt16());
        code.sar(a_hi, 16);
        code.sar(b_hi, 16);
    } else {
        code.movzx(a_lo, a_hi.cvt16());
        code.movzx(b_lo, b_hi.cvt16());
        code.shr(a_hi, 16);
        code.shr(b_hi, 16);
    }

    // Cross-wise arithmetic; the results stay in the a_* registers.
    if (hi_is_sum) {
        code.sub(a_lo, b_hi);
        code.add(a_hi, b_lo);
    } else {
        code.add(a_lo, b_hi);
        code.sub(a_hi, b_lo);
    }
    const Xbyak::Reg32 sum = hi_is_sum ? a_hi : a_lo;
    const Xbyak::Reg32 diff = hi_is_sum ? a_lo : a_hi;

    if (ge_inst) {
        // The b_* registers are free at this point; reuse them for the GE masks.
        const Xbyak::Reg32 sum_ge = b_hi;
        const Xbyak::Reg32 diff_ge = b_lo;

        code.mov(sum_ge, sum);
        code.mov(diff_ge, diff);

        if (is_signed) {
            // All-ones when the sum is non-negative.
            code.not_(sum_ge);
            code.sar(sum_ge, 31);
        } else {
            // Move bit 16 (the carry out of the 16-bit sum) to bit 31 and smear it.
            code.shl(sum_ge, 15);
            code.sar(sum_ge, 31);
        }
        // All-ones when the difference is non-negative.
        code.not_(diff_ge);
        code.sar(diff_ge, 31);
        // Keep each mask only in the halfword its result is stored in, then combine.
        code.and_(sum_ge, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
        code.and_(diff_ge, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
        code.or_(sum_ge, diff_ge);

        ctx.reg_alloc.DefineValue(ge_inst, sum_ge);
        ctx.EraseInstruction(ge_inst);
    }

    // Place the low result in the upper 16 bits of a_lo so shld can shift it in.
    // For halving, the low result moves up one bit less and the high result is
    // shifted right by one.
    if (is_halving) {
        code.shl(a_lo, 15);
        code.shr(a_hi, 1);
    } else {
        code.shl(a_lo, 16);
    }

    // a_hi = (a_hi << 16) | (top 16 bits of a_lo): high result above, low result below.
    code.shld(a_hi, a_lo, 16);

    ctx.reg_alloc.DefineValue(inst, a_hi);
}
|
|
|
|
// Unsigned, non-halving variant: high halfword receives the sum, low halfword the difference.
void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Signed, non-halving variant: high halfword receives the sum, low halfword the difference.
void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Unsigned, non-halving variant: high halfword receives the difference, low halfword the sum.
void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Signed, non-halving variant: high halfword receives the difference, low halfword the sum.
void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Unsigned, halving variant: high halfword receives the halved sum, low halfword the halved difference.
void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Signed, halving variant: high halfword receives the halved sum, low halfword the halved difference.
void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Unsigned, halving variant: high halfword receives the halved difference, low halfword the halved sum.
void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Signed, halving variant: high halfword receives the halved difference, low halfword the halved sum.
void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
|
|
|
|
// Emits a single two-operand packed instruction: `fn` is invoked on `code` with
// the first argument (in a scratch XMM register, which receives the result) as
// destination and the second argument as source. The destination register then
// becomes the value of `inst`.
static void EmitPackedOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm dest = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[1]);

    (code.*fn)(dest, src);

    ctx.reg_alloc.DefineValue(inst, dest);
}
|
|
|
|
// Packed saturating addition on unsigned bytes, emitted as a single paddusb.
void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::paddusb;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating addition on signed bytes, emitted as a single paddsb.
void EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::paddsb;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating subtraction on unsigned bytes, emitted as a single psubusb.
void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::psubusb;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating subtraction on signed bytes, emitted as a single psubsb.
void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::psubsb;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating addition on unsigned halfwords, emitted as a single paddusw.
void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::paddusw;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating addition on signed halfwords, emitted as a single paddsw.
void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::paddsw;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating subtraction on unsigned halfwords, emitted as a single psubusw.
void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::psubusw;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Packed saturating subtraction on signed halfwords, emitted as a single psubsw.
void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) {
    using PackedFn = void (Xbyak::CodeGenerator::*)(const Xbyak::Mmx&, const Xbyak::Operand&);
    const PackedFn emit_fn = &Xbyak::CodeGenerator::psubsw;
    EmitPackedOperation(code, ctx, inst, emit_fn);
}
|
|
|
|
// Emits the sum of absolute differences of the four packed bytes of the two
// arguments, using psadbw.
//
// psadbw accumulates over all eight bytes of the low quadword, whereas the packed
// operands handled by this backend are 32 bits wide. Nothing visible at this point
// guarantees that bits 32..63 of the XMM registers holding the operands are zero,
// so both operands are masked down to their low 32 bits first; otherwise stray
// upper bytes would be added into the result.
//
// NOTE(review): psadbw treats the bytes as unsigned, despite the "S8" in this
// function's name - confirm against the IR opcode's intended semantics.
void EmitX64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // Both operands are modified by the masking below, so both need scratch copies.
    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);

    // The low quadword of the constant is 0x00000000FFFFFFFF, clearing bytes 4..7.
    // Whatever the mask's upper quadword holds is irrelevant: psadbw sums each
    // quadword separately and only the low 32 bits of the result are meaningful.
    code.pand(xmm_a, code.MConst(0xFFFFFFFF));
    code.pand(xmm_b, code.MConst(0xFFFFFFFF));
    code.psadbw(xmm_a, xmm_b);

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}
|
|
|
|
// Emits a bitwise select: result = (args[2] & ge) | (args[1] & ~ge), where ge is
// args[0]. Three code sequences are available:
//   - XMM (pand/pandn/por) when at least two of the three arguments are already
//     in XMM registers,
//   - general-purpose registers with andn when the CPU supports BMI1,
//   - general-purpose registers with not/and otherwise.
void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const size_t xmm_resident = args[0].IsInXmm() + args[1].IsInXmm() + args[2].IsInXmm();

    if (xmm_resident >= 2) {
        const Xbyak::Xmm mask = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm if_clear = ctx.reg_alloc.UseXmm(args[1]);
        const Xbyak::Xmm if_set = ctx.reg_alloc.UseScratchXmm(args[2]);

        code.pand(if_set, mask);
        // pandn complements its destination: mask = ~mask & if_clear.
        code.pandn(mask, if_clear);
        code.por(if_set, mask);

        ctx.reg_alloc.DefineValue(inst, if_set);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) {
        const Xbyak::Reg32 mask = ctx.reg_alloc.UseGpr(args[0]).cvt32();
        const Xbyak::Reg32 if_clear = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
        const Xbyak::Reg32 if_set = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();

        code.and_(if_set, mask);
        // andn leaves mask intact: if_clear = ~mask & if_clear.
        code.andn(if_clear, mask, if_clear);
        code.or_(if_set, if_clear);

        ctx.reg_alloc.DefineValue(inst, if_set);
        return;
    }

    const Xbyak::Reg32 mask = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    const Xbyak::Reg32 if_clear = ctx.reg_alloc.UseGpr(args[1]).cvt32();
    const Xbyak::Reg32 if_set = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();

    code.and_(if_set, mask);
    // Without andn the mask is inverted in place, hence the scratch register.
    code.not_(mask);
    code.and_(mask, if_clear);
    code.or_(if_set, mask);

    ctx.reg_alloc.DefineValue(inst, if_set);
}
|
|
|
|
} // namespace Dynarmic::BackendX64
|