67443efb62
Makes namespacing a little less noisy
704 lines
24 KiB
C++
704 lines
24 KiB
C++
/* This file is part of the dynarmic project.
|
|
* Copyright (c) 2016 MerryMage
|
|
* This software may be used and distributed according to the terms of the GNU
|
|
* General Public License version 2 or any later version.
|
|
*/
|
|
|
|
#include "backend_x64/block_of_code.h"
|
|
#include "backend_x64/emit_x64.h"
|
|
#include "common/assert.h"
|
|
#include "common/common_types.h"
|
|
#include "frontend/ir/basic_block.h"
|
|
#include "frontend/ir/microinstruction.h"
|
|
#include "frontend/ir/opcodes.h"
|
|
|
|
namespace Dynarmic::BackendX64 {
|
|
|
|
using namespace Xbyak::util;
|
|
|
|
void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
code->paddb(xmm_a, xmm_b);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pcmpeqb(ones, ones);
|
|
|
|
code->movdqa(xmm_ge, xmm_a);
|
|
code->pminub(xmm_ge, xmm_b);
|
|
code->pcmpeqb(xmm_ge, xmm_b);
|
|
code->pxor(xmm_ge, ones);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pxor(xmm_ge, xmm_ge);
|
|
code->movdqa(saturated_sum, xmm_a);
|
|
code->paddsb(saturated_sum, xmm_b);
|
|
code->pcmpgtb(xmm_ge, saturated_sum);
|
|
code->pcmpeqb(saturated_sum, saturated_sum);
|
|
code->pxor(xmm_ge, saturated_sum);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
code->paddb(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
code->paddw(xmm_a, xmm_b);
|
|
|
|
if (ge_inst) {
|
|
if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pcmpeqb(ones, ones);
|
|
|
|
code->movdqa(xmm_ge, xmm_a);
|
|
code->pminuw(xmm_ge, xmm_b);
|
|
code->pcmpeqw(xmm_ge, xmm_b);
|
|
code->pxor(xmm_ge, ones);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
} else {
|
|
Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
|
|
|
|
// !(b <= a+b) == b > a+b
|
|
code->movdqa(tmp_a, xmm_a);
|
|
code->movdqa(tmp_b, xmm_b);
|
|
code->paddw(tmp_a, code->MConst(0x80008000));
|
|
code->paddw(tmp_b, code->MConst(0x80008000));
|
|
code->pcmpgtw(tmp_b, tmp_a); // *Signed* comparison!
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, tmp_b);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
}
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pxor(xmm_ge, xmm_ge);
|
|
code->movdqa(saturated_sum, xmm_a);
|
|
code->paddsw(saturated_sum, xmm_b);
|
|
code->pcmpgtw(xmm_ge, saturated_sum);
|
|
code->pcmpeqw(saturated_sum, saturated_sum);
|
|
code->pxor(xmm_ge, saturated_sum);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
code->paddw(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->movdqa(xmm_ge, xmm_a);
|
|
code->pmaxub(xmm_ge, xmm_b);
|
|
code->pcmpeqb(xmm_ge, xmm_a);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
code->psubb(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pxor(xmm_ge, xmm_ge);
|
|
code->movdqa(saturated_sum, xmm_a);
|
|
code->psubsb(saturated_sum, xmm_b);
|
|
code->pcmpgtb(xmm_ge, saturated_sum);
|
|
code->pcmpeqb(saturated_sum, saturated_sum);
|
|
code->pxor(xmm_ge, saturated_sum);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
code->psubb(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
if (!ge_inst) {
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
code->psubw(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
return;
|
|
}
|
|
|
|
if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->movdqa(xmm_ge, xmm_a);
|
|
code->pmaxuw(xmm_ge, xmm_b); // Requires SSE 4.1
|
|
code->pcmpeqw(xmm_ge, xmm_a);
|
|
|
|
code->psubw(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
return;
|
|
}
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
|
|
|
|
// (a >= b) == !(b > a)
|
|
code->pcmpeqb(ones, ones);
|
|
code->paddw(xmm_a, code->MConst(0x80008000));
|
|
code->paddw(xmm_b, code->MConst(0x80008000));
|
|
code->movdqa(xmm_ge, xmm_b);
|
|
code->pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison!
|
|
code->pxor(xmm_ge, ones);
|
|
|
|
code->psubw(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
if (ge_inst) {
|
|
Xbyak::Xmm saturated_diff = ctx.reg_alloc.ScratchXmm();
|
|
Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->pxor(xmm_ge, xmm_ge);
|
|
code->movdqa(saturated_diff, xmm_a);
|
|
code->psubsw(saturated_diff, xmm_b);
|
|
code->pcmpgtw(xmm_ge, saturated_diff);
|
|
code->pcmpeqw(saturated_diff, saturated_diff);
|
|
code->pxor(xmm_ge, saturated_diff);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
code->psubw(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
if (args[0].IsInXmm() || args[1].IsInXmm()) {
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
|
|
Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
|
|
|
|
// Since,
|
|
// pavg(a, b) == (a + b + 1) >> 1
|
|
// Therefore,
|
|
// ~pavg(~a, ~b) == (a + b) >> 1
|
|
|
|
code->pcmpeqb(ones, ones);
|
|
code->pxor(xmm_a, ones);
|
|
code->pxor(xmm_b, ones);
|
|
code->pavgb(xmm_a, xmm_b);
|
|
code->pxor(xmm_a, ones);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
} else {
|
|
Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
Xbyak::Reg32 and_a_b = reg_a;
|
|
Xbyak::Reg32 result = reg_a;
|
|
|
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
|
// We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
|
|
|
|
code->mov(xor_a_b, reg_a);
|
|
code->and_(and_a_b, reg_b);
|
|
code->xor_(xor_a_b, reg_b);
|
|
code->shr(xor_a_b, 1);
|
|
code->and_(xor_a_b, 0x7F7F7F7F);
|
|
code->add(result, xor_a_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, result);
|
|
}
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
if (args[0].IsInXmm() || args[1].IsInXmm()) {
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
|
|
|
code->movdqa(tmp, xmm_a);
|
|
code->pand(xmm_a, xmm_b);
|
|
code->pxor(tmp, xmm_b);
|
|
code->psrlw(tmp, 1);
|
|
code->paddw(xmm_a, tmp);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
} else {
|
|
Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
Xbyak::Reg32 and_a_b = reg_a;
|
|
Xbyak::Reg32 result = reg_a;
|
|
|
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
|
// We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
|
|
|
|
code->mov(xor_a_b, reg_a);
|
|
code->and_(and_a_b, reg_b);
|
|
code->xor_(xor_a_b, reg_b);
|
|
code->shr(xor_a_b, 1);
|
|
code->and_(xor_a_b, 0x7FFF7FFF);
|
|
code->add(result, xor_a_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, result);
|
|
}
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
Xbyak::Reg32 and_a_b = reg_a;
|
|
Xbyak::Reg32 result = reg_a;
|
|
Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
|
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
|
// We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
|
|
// carry propagates the sign bit from (x^y)>>1 upwards by one.
|
|
|
|
code->mov(xor_a_b, reg_a);
|
|
code->and_(and_a_b, reg_b);
|
|
code->xor_(xor_a_b, reg_b);
|
|
code->mov(carry, xor_a_b);
|
|
code->and_(carry, 0x80808080);
|
|
code->shr(xor_a_b, 1);
|
|
code->and_(xor_a_b, 0x7F7F7F7F);
|
|
code->add(result, xor_a_b);
|
|
code->xor_(result, carry);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, result);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
|
|
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>>1).
|
|
// The arithmetic shift right makes this signed.
|
|
|
|
code->movdqa(tmp, xmm_a);
|
|
code->pand(xmm_a, xmm_b);
|
|
code->pxor(tmp, xmm_b);
|
|
code->psraw(tmp, 1);
|
|
code->paddw(xmm_a, tmp);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
|
|
|
|
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x+y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
|
|
|
|
code->xor_(minuend, subtrahend);
|
|
code->and_(subtrahend, minuend);
|
|
code->shr(minuend, 1);
|
|
|
|
// At this point,
|
|
// minuend := (a^b) >> 1
|
|
// subtrahend := (a^b) & b
|
|
|
|
// We must now perform a partitioned subtraction.
|
|
// We can do this because minuend contains 7 bit fields.
|
|
// We use the extra bit in minuend as a bit to borrow from; we set this bit.
|
|
// We invert this bit at the end as this tells us if that bit was borrowed from.
|
|
code->or_(minuend, 0x80808080);
|
|
code->sub(minuend, subtrahend);
|
|
code->xor_(minuend, 0x80808080);
|
|
|
|
// minuend now contains the desired result.
|
|
ctx.reg_alloc.DefineValue(inst, minuend);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
|
|
|
|
Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
|
|
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
|
|
|
|
code->xor_(minuend, subtrahend);
|
|
code->and_(subtrahend, minuend);
|
|
code->mov(carry, minuend);
|
|
code->and_(carry, 0x80808080);
|
|
code->shr(minuend, 1);
|
|
|
|
// At this point,
|
|
// minuend := (a^b) >> 1
|
|
// subtrahend := (a^b) & b
|
|
// carry := (a^b) & 0x80808080
|
|
|
|
// We must now perform a partitioned subtraction.
|
|
// We can do this because minuend contains 7 bit fields.
|
|
// We use the extra bit in minuend as a bit to borrow from; we set this bit.
|
|
// We invert this bit at the end as this tells us if that bit was borrowed from.
|
|
// We then sign extend the result into this bit.
|
|
code->or_(minuend, 0x80808080);
|
|
code->sub(minuend, subtrahend);
|
|
code->xor_(minuend, 0x80808080);
|
|
code->xor_(minuend, carry);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, minuend);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
|
|
|
|
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
|
|
|
|
code->pxor(minuend, subtrahend);
|
|
code->pand(subtrahend, minuend);
|
|
code->psrlw(minuend, 1);
|
|
|
|
// At this point,
|
|
// minuend := (a^b) >> 1
|
|
// subtrahend := (a^b) & b
|
|
|
|
code->psubw(minuend, subtrahend);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, minuend);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
|
|
|
|
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
|
|
// Note that x^y always contains the LSB of the result.
|
|
// Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>>1) - ((x^y)&y).
|
|
|
|
code->pxor(minuend, subtrahend);
|
|
code->pand(subtrahend, minuend);
|
|
code->psraw(minuend, 1);
|
|
|
|
// At this point,
|
|
// minuend := (a^b) >>> 1
|
|
// subtrahend := (a^b) & b
|
|
|
|
code->psubw(minuend, subtrahend);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, minuend);
|
|
}
|
|
|
|
void EmitPackedSubAdd(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
|
|
|
Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
Xbyak::Reg32 reg_sum, reg_diff;
|
|
|
|
if (is_signed) {
|
|
code->movsx(reg_a_lo, reg_a_hi.cvt16());
|
|
code->movsx(reg_b_lo, reg_b_hi.cvt16());
|
|
code->sar(reg_a_hi, 16);
|
|
code->sar(reg_b_hi, 16);
|
|
} else {
|
|
code->movzx(reg_a_lo, reg_a_hi.cvt16());
|
|
code->movzx(reg_b_lo, reg_b_hi.cvt16());
|
|
code->shr(reg_a_hi, 16);
|
|
code->shr(reg_b_hi, 16);
|
|
}
|
|
|
|
if (hi_is_sum) {
|
|
code->sub(reg_a_lo, reg_b_hi);
|
|
code->add(reg_a_hi, reg_b_lo);
|
|
reg_diff = reg_a_lo;
|
|
reg_sum = reg_a_hi;
|
|
} else {
|
|
code->add(reg_a_lo, reg_b_hi);
|
|
code->sub(reg_a_hi, reg_b_lo);
|
|
reg_diff = reg_a_hi;
|
|
reg_sum = reg_a_lo;
|
|
}
|
|
|
|
if (ge_inst) {
|
|
// The reg_b registers are no longer required.
|
|
Xbyak::Reg32 ge_sum = reg_b_hi;
|
|
Xbyak::Reg32 ge_diff = reg_b_lo;
|
|
|
|
code->mov(ge_sum, reg_sum);
|
|
code->mov(ge_diff, reg_diff);
|
|
|
|
if (!is_signed) {
|
|
code->shl(ge_sum, 15);
|
|
code->sar(ge_sum, 31);
|
|
} else {
|
|
code->not_(ge_sum);
|
|
code->sar(ge_sum, 31);
|
|
}
|
|
code->not_(ge_diff);
|
|
code->sar(ge_diff, 31);
|
|
code->and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
|
|
code->and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
|
|
code->or_(ge_sum, ge_diff);
|
|
|
|
ctx.reg_alloc.DefineValue(ge_inst, ge_sum);
|
|
ctx.EraseInstruction(ge_inst);
|
|
}
|
|
|
|
if (is_halving) {
|
|
code->shl(reg_a_lo, 15);
|
|
code->shr(reg_a_hi, 1);
|
|
} else {
|
|
code->shl(reg_a_lo, 16);
|
|
}
|
|
|
|
// reg_a_lo now contains the low word and reg_a_hi now contains the high word.
|
|
// Merge them.
|
|
code->shld(reg_a_hi, reg_a_lo, 16);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, reg_a_hi);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, true, false, false);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, true, true, false);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, false, false, false);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, false, true, false);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, true, false, true);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, true, true, true);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, false, false, true);
|
|
}
|
|
|
|
void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedSubAdd(code, ctx, inst, false, true, true);
|
|
}
|
|
|
|
static void EmitPackedOperation(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
|
|
|
(code->*fn)(xmm_a, xmm_b);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusb);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsb);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusb);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsb);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusw);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsw);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusw);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsw);
|
|
}
|
|
|
|
void EmitX64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) {
|
|
EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psadbw);
|
|
}
|
|
|
|
void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
|
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
|
|
size_t num_args_in_xmm = args[0].IsInXmm() + args[1].IsInXmm() + args[2].IsInXmm();
|
|
|
|
if (num_args_in_xmm >= 2) {
|
|
Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
Xbyak::Xmm to = ctx.reg_alloc.UseXmm(args[1]);
|
|
Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(args[2]);
|
|
|
|
code->pand(from, ge);
|
|
code->pandn(ge, to);
|
|
code->por(from, ge);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, from);
|
|
} else if (code->DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) {
|
|
Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
|
|
|
|
code->and_(from, ge);
|
|
code->andn(to, ge, to);
|
|
code->or_(from, to);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, from);
|
|
} else {
|
|
Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
|
Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(args[1]).cvt32();
|
|
Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
|
|
|
|
code->and_(from, ge);
|
|
code->not_(ge);
|
|
code->and_(ge, to);
|
|
code->or_(from, ge);
|
|
|
|
ctx.reg_alloc.DefineValue(inst, from);
|
|
}
|
|
}
|
|
|
|
} // namespace Dynarmic::BackendX64
|