diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 989abd91..3b98af67 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -6,6 +6,7 @@
 
 #include
 #include
+#include <cstdlib>
 #include
 #include
 
@@ -308,6 +309,60 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
     }
 }
 
+// Emits code that replaces every signed 8-bit lane of `data` with its absolute value.
+// Without SSSE3's pabsb, |x| is formed as the unsigned minimum of x and (0 - x).
+static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        code.pabsb(data, data);
+    } else {
+        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+        code.pxor(temp, temp);
+        code.psubb(temp, data);
+        code.pminub(data, temp);
+    }
+}
+
+// Emits code that replaces every signed 16-bit lane of `data` with its absolute value.
+// Without SSSE3's pabsw, |x| is formed as the signed maximum of x and (0 - x).
+static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        code.pabsw(data, data);
+    } else {
+        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+        code.pxor(temp, temp);
+        code.psubw(temp, data);
+        code.pmaxsw(data, temp);
+    }
+}
+
+// Emits code that replaces every signed 32-bit lane of `data` with its absolute value.
+// Without SSSE3's pabsd, |x| is formed as (x ^ s) - s where s = x >> 31 (arithmetic shift).
+static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        code.pabsd(data, data);
+    } else {
+        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(temp, data);
+        code.psrad(temp, 31);
+        code.pxor(data, temp);
+        code.psubd(data, temp);
+    }
+}
+
+// Emits code that replaces every signed 64-bit lane of `data` with its absolute value.
+// Without vpabsq, the high dword of each lane is broadcast and shifted to build the sign mask s, then (x ^ s) - s.
+static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
+        code.vpabsq(data, data);
+    } else {
+        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+        code.pshufd(temp, data, 0b11110101);
+        code.psrad(temp, 31);
+        code.pxor(data, temp);
+        code.psubq(data, temp);
+    }
+}
+
 static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
@@ -315,46 +370,16 @@ static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockO
 
     switch (esize) {
     case 8:
-        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-            code.pabsb(data, data);
-        } else {
-            const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
-            code.pxor(temp, temp);
-            code.psubb(temp, data);
-            code.pminub(data, temp);
-        }
+        VectorAbs8(code, ctx, data);
         break;
     case 16:
-        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-            code.pabsw(data, data);
-        } else {
-            const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
-            code.pxor(temp, temp);
-            code.psubw(temp, data);
-            code.pmaxsw(data, temp);
-        }
+        VectorAbs16(code, ctx, data);
         break;
     case 32:
-        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-            code.pabsd(data, data);
-        } else {
-            const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
-            code.movdqa(temp, data);
-            code.psrad(temp, 31);
-            code.pxor(data, temp);
-            code.psubd(data, temp);
-        }
+        VectorAbs32(code, ctx, data);
         break;
     case 64:
-        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
-            code.vpabsq(data, data);
-        } else {
-            const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
-            code.pshufd(temp, data, 0b11110101);
-            code.psrad(temp, 31);
-            code.pxor(data, temp);
-            code.psubq(data, temp);
-        }
+        VectorAbs64(code, ctx, data);
         break;
     }
 
@@ -2613,6 +2638,137 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i
     EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
 }
 
+// Emits code for a vector signed saturating absolute value on `esize`-bit lanes.
+// A lane holding the most negative value has no positive counterpart: it is clamped to
+// the largest positive value and the saturation flag byte at offsetof_fpsr_qc is set.
+// The 64-bit case emits pcmpeqq (SSE4.1); EmitVectorSignedSaturatedAbs64 checks for that.
+static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
+    // Every lane of this constant holds the most negative value for the lane width.
+    const Xbyak::Address mask = [esize, &code] {
+        switch (esize) {
+        case 8:
+            return code.MConst(xword, 0x8080808080808080, 0x8080808080808080);
+        case 16:
+            return code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
+        case 32:
+            return code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
+        case 64:
+            return code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
+        default:
+            UNREACHABLE();
+            return Xbyak::Address{0};
+        }
+    }();
+
+    // pmovmskb produces one bit per byte; these masks pick the bit belonging to the
+    // most significant byte of each lane (every bit for 8-bit lanes).
+    const u32 test_mask = [esize] {
+        switch (esize) {
+        case 8:
+            return 0b1111'1111'1111'1111;
+        case 16:
+            return 0b1010'1010'1010'1010;
+        case 32:
+            return 0b1000'1000'1000'1000;
+        case 64:
+            return 0b10000000'10000000;
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }();
+
+    const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) {
+        switch (esize) {
+        case 8:
+            code.pcmpeqb(x, y);
+            break;
+        case 16:
+            code.pcmpeqw(x, y);
+            break;
+        case 32:
+            code.pcmpeqd(x, y);
+            break;
+        case 64:
+            code.pcmpeqq(x, y);
+            break;
+        }
+    };
+
+    switch (esize) {
+    case 8:
+        VectorAbs8(code, ctx, data);
+        break;
+    case 16:
+        VectorAbs16(code, ctx, data);
+        break;
+    case 32:
+        VectorAbs32(code, ctx, data);
+        break;
+    case 64:
+        VectorAbs64(code, ctx, data);
+        break;
+    }
+
+    // After the plain absolute value, only lanes that started as the most negative
+    // value still equal `mask`. `sign` becomes all-ones in exactly those lanes, and
+    // the xor turns them into the largest positive value (e.g. 0x80 ^ 0xFF = 0x7F).
+    code.movdqa(sign, mask);
+    vector_equality(sign, data);
+    code.pxor(data, sign);
+
+    // `sign` is non-zero precisely in the lanes that saturated, so it also serves as
+    // the test for whether the Q flag needs to be set; no copy of the input is needed.
+    const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+    code.pmovmskb(bit, sign);
+    code.test(bit, test_mask);
+    code.setnz(bit.cvt8());
+
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAbs(8, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAbs(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAbs(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
+    // The vectorised path compares 64-bit lanes with pcmpeqq, which requires SSE4.1.
+    // CPUs without it go through the C++ lambda fallback below.
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
+        return;
+    }
+
+    EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
+        bool qc_flag = false;
+
+        for (size_t i = 0; i < result.size(); i++) {
+            if (static_cast<u64>(data[i]) == 0x8000000000000000) {
+                result[i] = 0x7FFFFFFFFFFFFFFF;
+                qc_flag = true;
+            } else {
+                result[i] = std::abs(data[i]);
+            }
+        }
+
+        return qc_flag;
+    });
+}
+
 static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 6d65b477..a1680af0 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1496,6 +1496,22 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons
     return {};
 }
 
+// Builds the VectorSignedSaturatedAbs instruction for 8/16/32/64-bit lanes; any other width is unreachable.
+U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAbs8, a);
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAbs16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAbs32, a);
+    case 64:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAbs64, a);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 16:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 1f3c4d89..46beaf9f 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -263,6 +263,7 @@ public:
     U128 VectorShuffleWords(const U128& a, u8 mask);
     U128 VectorSignExtend(size_t original_esize, const U128& a);
     U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
+    U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
     U128 VectorSub(size_t esize, const U128& a, const U128& b);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index fffd6f42..c0da1c25 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -347,6 +347,10 @@ bool Inst::ReadsFromFPSRCumulativeSaturationBit() const {
 bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     switch (op) {
     case Opcode::A64OrQC:
+    case Opcode::VectorSignedSaturatedAbs8:
+    case Opcode::VectorSignedSaturatedAbs16:
+    case Opcode::VectorSignedSaturatedAbs32:
+    case Opcode::VectorSignedSaturatedAbs64:
     case Opcode::VectorSignedSaturatedNarrowToSigned16:
     case Opcode::VectorSignedSaturatedNarrowToSigned32:
     case Opcode::VectorSignedSaturatedNarrowToSigned64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 87f025b1..7b93cbcd 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -391,6 +391,10 @@ OPCODE(VectorSignExtend64,                                  U128,           U128
 OPCODE(VectorSignedAbsoluteDifference8,                     U128,           U128,           U128                            )
 OPCODE(VectorSignedAbsoluteDifference16,                    U128,           U128,           U128                            )
 OPCODE(VectorSignedAbsoluteDifference32,                    U128,           U128,           U128                            )
+OPCODE(VectorSignedSaturatedAbs8,                           U128,           U128                                            )
+OPCODE(VectorSignedSaturatedAbs16,                          U128,           U128                                            )
+OPCODE(VectorSignedSaturatedAbs32,                          U128,           U128                                            )
+OPCODE(VectorSignedSaturatedAbs64,                          U128,           U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToSigned16,               U128,           U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToSigned32,               U128,           U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToSigned64,               U128,           U128                                            )