diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 55eb0dfc..e30c3451 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -257,6 +257,7 @@ if (ARCHITECTURE_x86_64)
         backend/x64/emit_x64_sm4.cpp
         backend/x64/emit_x64_vector.cpp
         backend/x64/emit_x64_vector_floating_point.cpp
+        backend/x64/emit_x64_vector_saturation.cpp
         backend/x64/exception_handler.h
         backend/x64/hostloc.cpp
         backend/x64/hostloc.h
diff --git a/src/backend/x64/emit_x64_vector_saturation.cpp b/src/backend/x64/emit_x64_vector_saturation.cpp
new file mode 100644
index 00000000..763462e4
--- /dev/null
+++ b/src/backend/x64/emit_x64_vector_saturation.cpp
@@ -0,0 +1,370 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "backend/x64/block_of_code.h"
+#include "backend/x64/emit_x64.h"
+#include "common/common_types.h"
+#include "frontend/ir/microinstruction.h"
+#include "frontend/ir/opcodes.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+namespace {
+
+void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*saturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*unsaturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*sub_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
+
+    if (overflow_inst) {
+        code.movaps(xmm0, result);
+    }
+
+    (code.*saturated_fn)(result, addend);
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        (code.*unsaturated_fn)(xmm0, addend);
+        (code.*sub_fn)(xmm0, result);
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(xmm0, xmm0);
+        } else {
+            const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+            code.pxor(tmp, tmp);
+            code.pcmpeqw(xmm0, tmp);
+            code.pmovmskb(overflow.cvt32(), xmm0);
+            code.xor_(overflow.cvt32(), 0xFFFF);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+enum class Op {
+    Add,
+    Sub,
+};
+
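+// Signed saturated add/sub for 32-bit and 64-bit lanes. Per lane, signed
+// overflow occurred iff the operands have the same sign (for addition) or
+// differing signs (for subtraction) and the result's sign differs from the
+// first operand's; that condition is collected in the sign bits of xmm0.
+// The clamp value is derived from the wrapped result: a "negative" wrapped
+// result means positive overflow (clamp to the maximum), otherwise negative
+// overflow (clamp to the minimum).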
+template <Op op, size_t esize>
+void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    static_assert(esize == 32 || esize == 64);
+    constexpr u64 msb_mask = esize == 32 ? 0x8000000080000000 : 0x8000000000000000;
+
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm arg = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // TODO AVX-512: vpternlog, vpsraq
+    // TODO AVX2 implementation
+
+    code.movaps(xmm0, result);
+    code.movaps(tmp, result);
+
+    if constexpr (op == Op::Add) {
+        if constexpr (esize == 32) {
+            code.paddd(result, arg);
+        } else {
+            code.paddq(result, arg);
+        }
+    } else {
+        if constexpr (esize == 32) {
+            code.psubd(result, arg);
+        } else {
+            code.psubq(result, arg);
+        }
+    }
+
+    code.pxor(tmp, result);
+    code.pxor(xmm0, arg);
+    if constexpr (op == Op::Add) {
+        code.pandn(xmm0, tmp);
+    } else {
+        code.pand(xmm0, tmp);
+    }
+
+    code.movaps(tmp, result);
+    code.psrad(tmp, 31);
+    if constexpr (esize == 64) {
+        code.pshufd(tmp, tmp, 0b11110101);
+    }
+    code.pxor(tmp, code.MConst(xword, msb_mask, msb_mask));
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(xmm0, code.MConst(xword, msb_mask, msb_mask));
+        } else {
+            if constexpr (esize == 32) {
+                code.movmskps(overflow.cvt32(), xmm0);
+            } else {
+                code.movmskpd(overflow.cvt32(), xmm0);
+            }
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        if constexpr (esize == 32) {
+            code.blendvps(result, tmp);
+        } else {
+            code.blendvpd(result, tmp);
+        }
+
+        ctx.reg_alloc.DefineValue(inst, result);
+    } else {
+        code.psrad(xmm0, 31);
+        if constexpr (esize == 64) {
+            code.pshufd(xmm0, xmm0, 0b11110101);
+        }
+
+        code.pand(tmp, xmm0);
+        code.pandn(xmm0, result);
+        code.por(tmp, xmm0);
+
+        ctx.reg_alloc.DefineValue(inst, tmp);
+    }
+}
+
+} // anonymous namespace
+
+void EmitX64::EmitVectorSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturated<Op::Add, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturated<Op::Add, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturated<Op::Sub, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturated<Op::Sub, 64>(code, ctx, inst);
+}
+
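+// Unsigned saturated arithmetic. The 8-bit and 16-bit cases map directly onto
+// the native paddus/psubus instructions. SSE has no unsigned saturating
+// add for 32-bit or 64-bit lanes, so the adds below recover the per-lane
+// carry out of the top bit as the sign of (a & b) + ((a ^ b) >> 1), broadcast
+// it across the lane, and OR the resulting all-ones mask into the sum so that
+// overflowing lanes clamp to the maximum value.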
+void EmitX64::EmitVectorUnsignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // TODO AVX2, AVX-512: vpternlog
+
+    code.movaps(tmp, result);
+    code.movaps(xmm0, result);
+
+    code.pxor(xmm0, addend);
+    code.pand(tmp, addend);
+    code.paddd(result, addend);
+
+    code.psrld(xmm0, 1);
+    code.paddd(tmp, xmm0);
+    code.psrad(tmp, 31);
+
+    code.por(result, tmp);
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskps(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // TODO AVX2, AVX-512: vpternlog
+
+    code.movaps(tmp, result);
+    code.movaps(xmm0, result);
+
+    code.pxor(xmm0, addend);
+    code.pand(tmp, addend);
+    code.paddq(result, addend);
+
+    code.psrlq(xmm0, 1);
+    code.paddq(tmp, xmm0);
+    code.psrad(tmp, 31);
+    code.pshufd(tmp, tmp, 0b11110101);
+
+    code.por(result, tmp);
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskpd(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw);
+}
+
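+// For the 32-bit and 64-bit unsigned subtractions, the borrow out of the top
+// bit is the sign of ((a ^ b) >> 1) - (b & (a ^ b)); broadcasting that sign
+// across the lane yields an all-ones mask for lanes where the subtrahend was
+// larger, which pandn then uses to clamp those lanes to zero.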
+void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // TODO AVX2, AVX-512: vpternlog
+
+    code.movaps(tmp, result);
+    code.movaps(xmm0, subtrahend);
+
+    code.pxor(tmp, subtrahend);
+    code.psubd(result, subtrahend);
+    code.pand(xmm0, tmp);
+
+    code.psrld(tmp, 1);
+    code.psubd(tmp, xmm0);
+    code.psrad(tmp, 31);
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskps(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    code.pandn(tmp, result);
+
+    ctx.reg_alloc.DefineValue(inst, tmp);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+    const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    // TODO AVX2, AVX-512: vpternlog
+
+    code.movaps(tmp, result);
+    code.movaps(xmm0, subtrahend);
+
+    code.pxor(tmp, subtrahend);
+    code.psubq(result, subtrahend);
+    code.pand(xmm0, tmp);
+
+    code.psrlq(tmp, 1);
+    code.psubq(tmp, xmm0);
+    code.psrad(tmp, 31);
+    code.pshufd(tmp, tmp, 0b11110101);
+
+    if (overflow_inst) {
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskpd(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
+        code.setnz(overflow);
+
+        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+        ctx.EraseInstruction(overflow_inst);
+    }
+
+    code.pandn(tmp, result);
+
+    ctx.reg_alloc.DefineValue(inst, tmp);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 5eff3492..dad61212 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -635,6 +635,83 @@ ResultAndOverflow IREmitter::UnsignedSaturation(const U32& a, size_t bit_si
     return {result, overflow};
 }
 
+ResultAndOverflow<U128> IREmitter::VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b) {
+    const auto result = [&]{
+        switch (esize) {
+        case 8:
+            return Inst<U128>(Opcode::VectorSignedSaturatedAdd8, a, b);
+        case 16:
+            return Inst<U128>(Opcode::VectorSignedSaturatedAdd16, a, b);
+        case 32:
+            return Inst<U128>(Opcode::VectorSignedSaturatedAdd32, a, b);
+        case 64:
+            return Inst<U128>(Opcode::VectorSignedSaturatedAdd64, a, b);
+        default:
+            UNREACHABLE();
+        }
+    }();
+    const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+    return {result, overflow};
+}
+
+ResultAndOverflow<U128> IREmitter::VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b) {
+    const auto result = [&]{
+        switch (esize) {
+        case 8:
+            return Inst<U128>(Opcode::VectorSignedSaturatedSub8, a, b);
+        case 16:
+            return Inst<U128>(Opcode::VectorSignedSaturatedSub16, a, b);
+        case 32:
+            return Inst<U128>(Opcode::VectorSignedSaturatedSub32, a, b);
+        case 64:
+            return Inst<U128>(Opcode::VectorSignedSaturatedSub64, a, b);
+        default:
+            UNREACHABLE();
+        }
+    }();
+    const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+    return {result, overflow};
+}
+
+ResultAndOverflow<U128> IREmitter::VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b) {
+    const auto result = [&]{
+        switch (esize) {
+        case 8:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd8, a, b);
+        case 16:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd16, a, b);
+        case 32:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd32, a, b);
+        case 64:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd64, a, b);
+        default:
+            UNREACHABLE();
+        }
+    }();
+    const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+    return {result, overflow};
+}
+
+ResultAndOverflow<U128> IREmitter::VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b) {
+    const auto result = [&]{
+        switch (esize) {
+        case 8:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedSub8, a, b);
+        case 16:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedSub16, a, b);
+        case 32:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedSub32, a, b);
+        case 64:
+            return Inst<U128>(Opcode::VectorUnsignedSaturatedSub64, a, b);
+        default:
+            UNREACHABLE();
+        }
+    }();
+    const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+    return {result, overflow};
+}
+
+
 ResultAndGE IREmitter::PackedAddU8(const U32& a, const U32& b) {
     const auto result = Inst<U32>(Opcode::PackedAddU8, a, b);
     const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 6b8eec0d..4b28b0d9 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -166,6 +166,11 @@ public:
     ResultAndOverflow<UAny> UnsignedSaturatedSub(const UAny& a, const UAny& b);
     ResultAndOverflow<U32> UnsignedSaturation(const U32& a, size_t bit_size_to_saturate_to);
 
+    ResultAndOverflow<U128> VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b);
+    ResultAndOverflow<U128> VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b);
+    ResultAndOverflow<U128> VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b);
+    ResultAndOverflow<U128> VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b);
+
     ResultAndGE PackedAddU8(const U32& a, const U32& b);
     ResultAndGE PackedAddS8(const U32& a, const U32& b);
     ResultAndGE PackedAddU16(const U32& a, const U32& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 29288348..f601c891 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -194,6 +194,24 @@ OPCODE(UnsignedSaturatedSub32, U32, U32,
 OPCODE(UnsignedSaturatedSub64, U64, U64, U64 )
 OPCODE(UnsignedSaturation, U32, U32, U8 )
 
+// Vector saturated instructions
+OPCODE(VectorSignedSaturatedAdd8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd64, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub64, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd64, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub64, U128, U128, U128 )
+
 // Packed instructions
 OPCODE(PackedAddU8, U32, U32, U32 )
 OPCODE(PackedAddS8, U32, U32, U32 )