A64: Implement SQXTN (vector)

2018-07-24 17:59:14 +01:00 · 2018-07-24 17:59:14 +01:00 · 3874cb37e3
commit 3874cb37e3
parent 8ef114d48f
8 changed files with 139 additions and 38 deletions
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@ -2193,6 +2193,73 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i
    EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
 }

+// Emits x64 code that narrows every signed lane of args[0] to half its width with signed
+// saturation. Only original_esize values of 16 and 32 are handled; anything else hits
+// UNREACHABLE().
+//
+// Strategy: pack the source with itself using the saturating pack instructions, widen the
+// narrowed lanes back to the original width, and compare that reconstruction with the source.
+// Any mismatch means at least one lane was saturated, in which case 1 is ORed into the byte
+// at r15 + offsetof_fpsr_qc.
+static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(dest, src);
+
+    switch (original_esize) {
+    case 16:
+        code.packsswb(dest, dest);
+        // Sign bytes are derived from the wide source lanes: psraw by 15 leaves
+        // 0x0000/0xFFFF per word, which packs down to 0x00/0xFF per byte.
+        code.movdqa(sign, src);
+        code.psraw(sign, 15);
+        code.packsswb(sign, sign);
+        // Interleave narrowed bytes with their sign bytes to rebuild 16-bit lanes.
+        code.movdqa(reconstructed, dest);
+        code.punpcklbw(reconstructed, sign);
+        break;
+    case 32:
+        code.packssdw(dest, dest);
+        code.movdqa(reconstructed, dest);
+        // Here the sign words can be taken directly from the narrowed 16-bit lanes.
+        code.movdqa(sign, dest);
+        code.psraw(sign, 15);
+        // Interleave narrowed words with their sign words to rebuild 32-bit lanes.
+        code.punpcklwd(reconstructed, sign);
+        break;
+    default:
+        UNREACHABLE();
+        break;
+    }
+
+    const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+    // Both paths must leave ZF clear exactly when some lane failed to round-trip.
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        // XOR is all-zero only when reconstruction == source; ptest sets ZF in that case.
+        code.pxor(reconstructed, src);
+        code.ptest(reconstructed, reconstructed);
+    } else {
+        // pcmpeqd yields all-ones for each equal dword, so the movmskps mask is 0b1111 only
+        // when every lane matched. Comparing against 0b1111 (not 0) gives ZF set on
+        // "no saturation", matching the SSE4.1 path above.
+        code.pcmpeqd(reconstructed, src);
+        code.movmskps(bit, reconstructed);
+        code.cmp(bit, 0b1111);
+    }
+
+    // bit = 1 if saturation occurred; accumulate it into the sticky QC byte.
+    code.setnz(bit.cvt8());
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+// Signed saturating narrow for 16-bit source lanes; forwards to the shared emitter.
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned16(EmitContext& ctx, IR::Inst* inst) {
+    const size_t source_lane_bits = 16;
+    EmitVectorSignedSaturatedNarrowToSigned(source_lane_bits, code, ctx, inst);
+}
+
+// Signed saturating narrow for 32-bit source lanes; forwards to the shared emitter.
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned32(EmitContext& ctx, IR::Inst* inst) {
+    const size_t source_lane_bits = 32;
+    EmitVectorSignedSaturatedNarrowToSigned(source_lane_bits, code, ctx, inst);
+}
+
+// Signed saturating narrow of 64-bit lanes to 32-bit lanes, implemented through the
+// one-argument fallback helper rather than hand-emitted SSE code.
+//
+// The lambda clamps each source lane to the s32 range, stores it, and returns true if any
+// lane had to be clamped. Presumably the helper folds that flag into FPSR.QC; confirm
+// against EmitOneArgumentFallbackWithSaturation.
+// NOTE(review): only the first a.size() lanes of result are written here.
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::Inst* inst) {
+    EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s64>& a) {
+        // The bounds must be formed as s64. A bare `-0x80000000` negates an unsigned int
+        // literal and yields +2147483648, which would make lo > hi in std::clamp (undefined
+        // behaviour, and wrong results in practice).
+        constexpr s64 min_s32 = -static_cast<s64>(0x80000000);
+        constexpr s64 max_s32 = static_cast<s64>(0x7FFFFFFF);
+
+        bool qc_flag = false;
+        for (size_t i = 0; i < a.size(); ++i) {
+            const s64 saturated = std::clamp<s64>(a[i], min_s32, max_s32);
+            result[i] = static_cast<s32>(saturated);
+            qc_flag |= saturated != a[i];
+        }
+        return qc_flag;
+    });
+}
+
 static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@ -579,7 +579,7 @@ INST(CMEQ_zero_2,            "CMEQ (zero)",                               "0Q001
 INST(CMLT_2,                 "CMLT (zero)",                               "0Q001110zz100000101010nnnnnddddd")
 INST(ABS_2,                  "ABS",                                       "0Q001110zz100000101110nnnnnddddd")
 INST(XTN,                    "XTN, XTN2",                                 "0Q001110zz100001001010nnnnnddddd")
-//INST(SQXTN_2,                "SQXTN, SQXTN2",                             "0Q001110zz100001010010nnnnnddddd")
+INST(SQXTN_2,                "SQXTN, SQXTN2",                             "0Q001110zz100001010010nnnnnddddd")
 //INST(FCVTN,                  "FCVTN, FCVTN2",                             "0Q0011100z100001011010nnnnnddddd")
 //INST(FCVTL,                  "FCVTL, FCVTL2",                             "0Q0011100z100001011110nnnnnddddd")
 //INST(FRINTN_1,               "FRINTN (vector)",                           "0Q00111001111001100010nnnnnddddd")
--- a/src/frontend/A64/translate/impl/impl.h
+++ b/src/frontend/A64/translate/impl/impl.h
@ -515,35 +515,20 @@ struct TranslatorVisitor final {

    // Data Processing - FP and SIMD - Scalar two-register misc
    bool SUQADD_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool SQABS_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool CMLT_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool ABS_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool SQXTN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
    bool USQADD_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool SQNEG_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool CMLE_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool NEG_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool SQXTUN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
    bool UQXTN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
    bool FCVTXN_1(bool sz, Vec Vn, Reg Rd);
-    bool FCVTXN_2(bool Q, bool sz, Vec Vn, Reg Rd);

    // Data Processing - FP and SIMD - SIMD Scalar pairwise
    bool ADDP_pair(Imm<2> size, Vec Vn, Vec Vd);
@ -704,28 +689,6 @@ struct TranslatorVisitor final {
    bool FMINNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
    bool FMINP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);

-    // Data Processing - FP and SIMD - SIMD Two-register misc
-    bool FRINTN_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTM_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FABS_1(bool Q, Vec Vn, Vec Vd);
-    bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTP_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTZ_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTA_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTX_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FNEG_1(bool Q, Vec Vn, Vec Vd);
-    bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTI_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FSQRT_1(bool Q, Vec Vn, Vec Vd);
-    bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd);
-
    // Data Processing - FP and SIMD - SIMD Three same extra
    bool SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
    bool UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
@ -751,6 +714,41 @@ struct TranslatorVisitor final {
    bool NOT(bool Q, Vec Vn, Vec Vd);
    bool RBIT_asimd(bool Q, Vec Vn, Vec Vd);
    bool URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
+    bool FCVTXN_2(bool Q, bool sz, Vec Vn, Reg Rd);
+    bool FRINTN_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTM_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FABS_1(bool Q, Vec Vn, Vec Vd);
+    bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTP_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTZ_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTA_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTX_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FNEG_1(bool Q, Vec Vn, Vec Vd);
+    bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTI_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FSQRT_1(bool Q, Vec Vn, Vec Vd);
+    bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd);

    // Data Processing - FP and SIMD - SIMD across lanes
    bool SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
--- a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
+++ b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
@ -291,6 +291,22 @@ bool TranslatorVisitor::SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
    return true;
 }

+// Translates the vector form of SQXTN/SQXTN2: reads the full 128-bit Vn, narrows each
+// signed lane to half its width with saturation, and writes the 64-bit result into the
+// part of Vd selected by Q (part index 1 when Q is set, otherwise 0).
+bool TranslatorVisitor::SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+    // size == 0b11 is treated as a reserved encoding.
+    if (size == 0b11) {
+        return ReservedValue();
+    }
+
+    const size_t narrow_esize = 8 << size.ZeroExtend<size_t>();
+    const size_t wide_esize = 2 * narrow_esize;
+    const size_t half_width = 64;
+    const size_t dest_part = Q ? 1 : 0;
+
+    const IR::U128 wide_lanes = V(2 * half_width, Vn);
+    const IR::U128 narrowed = ir.VectorSignedSaturatedNarrowToSigned(wide_esize, wide_lanes);
+
+    Vpart(half_width, Vd, dest_part, narrowed);
+    return true;
+}
+
 bool TranslatorVisitor::NOT(bool Q, Vec Vn, Vec Vd) {
    const size_t datasize = Q ? 128 : 64;

--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@ -1292,6 +1292,19 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons
    return {};
 }

+// Emits the signed-saturating-narrow IR instruction matching the source lane width.
+// original_esize must be 16, 32 or 64; any other value reaches UNREACHABLE().
+U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
+    if (original_esize == 16) {
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned16, a);
+    }
+    if (original_esize == 32) {
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned32, a);
+    }
+    if (original_esize == 64) {
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned64, a);
+    }
+
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) {
    switch (original_esize) {
    case 16:
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@ -249,6 +249,7 @@ public:
    U128 VectorShuffleWords(const U128& a, u8 mask);
    U128 VectorSignExtend(size_t original_esize, const U128& a);
    U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
+    U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
    U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
    U128 VectorSub(size_t esize, const U128& a, const U128& b);
    U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@ -341,6 +341,9 @@ bool Inst::ReadsFromFPSRCumulativeSaturationBit() const {

 bool Inst::WritesToFPSRCumulativeSaturationBit() const {
    switch (op) {
+    case Opcode::VectorSignedSaturatedNarrowToSigned16:
+    case Opcode::VectorSignedSaturatedNarrowToSigned32:
+    case Opcode::VectorSignedSaturatedNarrowToSigned64:
    case Opcode::VectorSignedSaturatedNarrowToUnsigned16:
    case Opcode::VectorSignedSaturatedNarrowToUnsigned32:
    case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@ -347,6 +347,9 @@ OPCODE(VectorSignExtend64,                      T::U128,        T::U128
 OPCODE(VectorSignedAbsoluteDifference8,         T::U128,        T::U128,        T::U128                         )
 OPCODE(VectorSignedAbsoluteDifference16,        T::U128,        T::U128,        T::U128                         )
 OPCODE(VectorSignedAbsoluteDifference32,        T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorSignedSaturatedNarrowToSigned16,   T::U128,        T::U128                                         )
+OPCODE(VectorSignedSaturatedNarrowToSigned32,   T::U128,        T::U128                                         )
+OPCODE(VectorSignedSaturatedNarrowToSigned64,   T::U128,        T::U128                                         )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned16, T::U128,        T::U128                                         )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned32, T::U128,        T::U128                                         )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned64, T::U128,        T::U128                                         )