diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 052b53f1..348acfbb 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -31,7 +31,7 @@ struct NaNWrapper {
     static constexpr u64 value = 0x7ff8'0000'0000'0000;
 };
 
-template <typename T>
+template <typename T, auto IndexFunction>
 static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& xmm_a, const Xbyak::Xmm& xmm_b, const Xbyak::Xmm& result, const Xbyak::Xmm& nan_mask) {
     static_assert(std::is_same_v<T, u32> || std::is_same_v<T, u64>, "T must be either u32 or u64");
 
@@ -69,7 +69,8 @@ static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& xm
     code.CallFunction(static_cast<void(*)(RegArray&, const RegArray&, const RegArray&)>(
         [](RegArray& result, const RegArray& a, const RegArray& b) {
             for (size_t i = 0; i < result.size(); ++i) {
-                if (auto r = FP::ProcessNaNs(a[i], b[i])) {
+                auto [first, second] = IndexFunction(i, a, b);
+                if (auto r = FP::ProcessNaNs(first, second)) {
                     result[i] = *r;
                 } else if (FP::IsNaN(result[i])) {
                     result[i] = NaNWrapper<T>::value;
@@ -86,14 +87,52 @@ static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& xm
     code.SwitchToNearCode();
 }
 
-template <typename Function>
+static std::tuple<u32, u32> DefaultIndexFunction32(size_t i, const std::array<u32, 4>& a, const std::array<u32, 4>& b) {
+    return std::make_tuple(a[i], b[i]);
+}
+
+static std::tuple<u64, u64> DefaultIndexFunction64(size_t i, const std::array<u64, 2>& a, const std::array<u64, 2>& b) {
+    return std::make_tuple(a[i], b[i]);
+}
+
+static std::tuple<u32, u32> PairedIndexFunction32(size_t i, const std::array<u32, 4>& a, const std::array<u32, 4>& b) {
+    if (i < 2) {
+        return std::make_tuple(a[2 * i], a[2 * i + 1]);
+    }
+    return std::make_tuple(b[2 * (i - 2)], b[2 * (i - 2) + 1]);
+}
+
+static std::tuple<u64, u64> PairedIndexFunction64(size_t i, const std::array<u64, 2>& a, const std::array<u64, 2>& b) {
+    return i == 0 ? std::make_tuple(a[0], a[1]) : std::make_tuple(b[0], b[1]);
+}
+
+static std::tuple<u32, u32> PairedLowerIndexFunction32(size_t i, const std::array<u32, 4>& a, const std::array<u32, 4>& b) {
+    switch (i) {
+    case 0:
+        return std::make_tuple(a[0], a[1]);
+    case 1:
+        return std::make_tuple(b[0], b[1]);
+    default:
+        return std::make_tuple(u32(0), u32(0));
+    }
+}
+
+static std::tuple<u64, u64> PairedLowerIndexFunction64(size_t i, const std::array<u64, 2>& a, const std::array<u64, 2>& b) {
+    return i == 0 ? std::make_tuple(a[0], b[0]) : std::make_tuple(u64(0), u64(0));
+}
+
+template <auto IndexFunction, typename Function>
 static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
         Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
 
-        (code.*fn)(xmm_a, xmm_b);
+        if constexpr (std::is_member_function_pointer_v<Function>) {
+            (code.*fn)(xmm_a, xmm_b);
+        } else {
+            fn(xmm_a, xmm_b);
+        }
 
         if (ctx.FPSCR_DN()) {
             Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
@@ -121,22 +160,30 @@ static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst*
     code.movaps(nan_mask, xmm_b);
     code.movaps(result, xmm_a);
     code.cmpunordps(nan_mask, xmm_a);
-    (code.*fn)(result, xmm_b);
+    if constexpr (std::is_member_function_pointer_v<Function>) {
+        (code.*fn)(result, xmm_b);
+    } else {
+        fn(result, xmm_b);
+    }
     code.cmpunordps(nan_mask, result);
 
-    HandleNaNs<u32>(code, ctx, xmm_a, xmm_b, result, nan_mask);
+    HandleNaNs<u32, IndexFunction>(code, ctx, xmm_a, xmm_b, result, nan_mask);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
-template <typename Function>
+template <auto IndexFunction, typename Function>
 static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
         Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
 
-        (code.*fn)(xmm_a, xmm_b);
+        if constexpr (std::is_member_function_pointer_v<Function>) {
+            (code.*fn)(xmm_a, xmm_b);
+        } else {
+            fn(xmm_a, xmm_b);
+        }
 
         if (ctx.FPSCR_DN()) {
             Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
@@ -165,10 +212,14 @@ static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst*
     code.movaps(nan_mask, xmm_b);
     code.movaps(result, xmm_a);
     code.cmpunordpd(nan_mask, xmm_a);
-    (code.*fn)(result, xmm_b);
+    if constexpr (std::is_member_function_pointer_v<Function>) {
+        (code.*fn)(result, xmm_b);
+    } else {
+        fn(result, xmm_b);
+    }
     code.cmpunordpd(nan_mask, result);
 
-    HandleNaNs<u64>(code, ctx, xmm_a, xmm_b, result, nan_mask);
+    HandleNaNs<u64, IndexFunction>(code, ctx, xmm_a, xmm_b, result, nan_mask);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
@@ -229,19 +280,19 @@ void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::addps);
+    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::addps);
 }
 
 void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
+    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
 }
 
 void EmitX64::EmitFPVectorDiv32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::divps);
+    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::divps);
 }
 
 void EmitX64::EmitFPVectorDiv64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::divpd);
+    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::divpd);
 }
 
 void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
@@ -305,11 +356,37 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
+    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
 }
 
 void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
+    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
+}
+
+void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation32<PairedIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::haddps);
+}
+
+void EmitX64::EmitFPVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation64<PairedIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::haddpd);
+}
+
+void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation32<PairedLowerIndexFunction32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+        const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+        code.xorps(zero, zero);
+        code.punpcklqdq(result, xmm_b);
+        code.haddps(result, zero);
+    });
+}
+
+void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation64<PairedLowerIndexFunction64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+        const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+        code.xorps(zero, zero);
+        code.punpcklqdq(result, xmm_b);
+        code.haddpd(result, zero);
+    });
 }
 
 void EmitX64::EmitFPVectorS32ToSingle(EmitContext& ctx, IR::Inst* inst) {
@@ -363,11 +440,11 @@ void EmitX64::EmitFPVectorS64ToDouble(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::subps);
+    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::subps);
 }
 
 void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
+    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
 }
 
 void EmitX64::EmitFPVectorU32ToSingle(EmitContext& ctx, IR::Inst* inst) {
diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 00c51443..39dd32de 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -766,7 +766,7 @@ INST(MLS_vec,                "MLS (vector)",                         "0Q101
 //INST(SQRDMULH_vec_2,       "SQRDMULH (vector)",                    "0Q101110zz1mmmmm101101nnnnnddddd")
 //INST(FMAXNMP_vec_2,        "FMAXNMP (vector)",                     "0Q1011100z1mmmmm110001nnnnnddddd")
 //INST(FMLAL_vec_2,          "FMLAL, FMLAL2 (vector)",               "0Q1011100z1mmmmm110011nnnnnddddd")
-//INST(FADDP_vec_2,          "FADDP (vector)",                       "0Q1011100z1mmmmm110101nnnnnddddd")
+INST(FADDP_vec_2,            "FADDP (vector)",                       "0Q1011100z1mmmmm110101nnnnnddddd")
 INST(FMUL_vec_2,             "FMUL (vector)",                        "0Q1011100z1mmmmm110111nnnnnddddd")
 INST(FCMGE_reg_4,            "FCMGE (register)",                     "0Q1011100z1mmmmm111001nnnnnddddd")
 INST(FACGE_4,                "FACGE",                                "0Q1011100z1mmmmm111011nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/simd_three_same.cpp b/src/frontend/A64/translate/impl/simd_three_same.cpp
index 6e56ddaa..36dfbca8 100644
--- a/src/frontend/A64/translate/impl/simd_three_same.cpp
+++ b/src/frontend/A64/translate/impl/simd_three_same.cpp
@@ -700,6 +700,21 @@ bool TranslatorVisitor::EOR_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) {
     return true;
 }
 
+bool TranslatorVisitor::FADDP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+    if (sz && !Q) {
+        return ReservedValue();
+    }
+
+    const size_t esize = sz ? 64 : 32;
+    const size_t datasize = Q ? 128 : 64;
+
+    const IR::U128 operand1 = V(datasize, Vn);
+    const IR::U128 operand2 = V(datasize, Vm);
+    const IR::U128 result = Q ? ir.FPVectorPairedAdd(esize, operand1, operand2) : ir.FPVectorPairedAddLower(esize, operand1, operand2);
+    V(datasize, Vd, result);
+    return true;
+}
+
 bool TranslatorVisitor::FMUL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
     if (sz && !Q) {
         return ReservedValue();
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 142de470..6d4dc842 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1647,6 +1647,28 @@ U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {
     return {};
 }
 
+U128 IREmitter::FPVectorPairedAdd(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorPairedAdd32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::FPVectorPairedAdd64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
+U128 IREmitter::FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorPairedAddLower32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::FPVectorPairedAddLower64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::FPVectorSub(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 32:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 00da94af..36fd0e68 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -295,6 +295,8 @@ public:
     U128 FPVectorGreater(size_t esize, const U128& a, const U128& b);
     U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b);
    U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
+    U128 FPVectorPairedAdd(size_t esize, const U128& a, const U128& b);
+    U128 FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b);
     U128 FPVectorSub(size_t esize, const U128& a, const U128& b);
     U128 FPVectorS32ToSingle(const U128& a);
     U128 FPVectorS64ToDouble(const U128& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 129fdaad..5935759c 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -427,6 +427,10 @@ OPCODE(FPVectorGreaterEqual32,                              T::U128,        T::U
 OPCODE(FPVectorGreaterEqual64,                              T::U128,        T::U128,        T::U128         )
 OPCODE(FPVectorMul32,                                       T::U128,        T::U128,        T::U128         )
 OPCODE(FPVectorMul64,                                       T::U128,        T::U128,        T::U128         )
+OPCODE(FPVectorPairedAddLower32,                            T::U128,        T::U128,        T::U128         )
+OPCODE(FPVectorPairedAddLower64,                            T::U128,        T::U128,        T::U128         )
+OPCODE(FPVectorPairedAdd32,                                 T::U128,        T::U128,        T::U128         )
+OPCODE(FPVectorPairedAdd64,                                 T::U128,        T::U128,        T::U128         )
 OPCODE(FPVectorS32ToSingle,                                 T::U128,        T::U128                         )
 OPCODE(FPVectorS64ToDouble,                                 T::U128,        T::U128                         )
 OPCODE(FPVectorSub32,                                       T::U128,        T::U128,        T::U128         )
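
Not part of the patch, but as a reference point for the lane mapping that PairedIndexFunction32 encodes above: FADDP (vector) conceptually concatenates the two source registers and sums adjacent element pairs, which is also what haddps computes per operand. Below is a minimal standalone sketch of that semantics for the 32-bit full-width case; the names are illustrative only and are not dynarmic code.

#include <array>
#include <cstddef>
#include <cstdio>
#include <tuple>

// Scalar reference model of FADDP.4S: each output lane is the sum of one
// adjacent pair taken from the concatenation of the two source vectors.
static std::array<float, 4> PairedAddReference(const std::array<float, 4>& a,
                                               const std::array<float, 4>& b) {
    std::array<float, 4> result{};
    for (std::size_t i = 0; i < result.size(); ++i) {
        // Same pairing as PairedIndexFunction32: lanes 0-1 come from a, lanes 2-3 from b.
        const auto [lo, hi] = i < 2
            ? std::make_tuple(a[2 * i], a[2 * i + 1])
            : std::make_tuple(b[2 * (i - 2)], b[2 * (i - 2) + 1]);
        result[i] = lo + hi;
    }
    return result;
}

int main() {
    const auto r = PairedAddReference({1.0f, 2.0f, 3.0f, 4.0f}, {10.0f, 20.0f, 30.0f, 40.0f});
    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 3 7 30 70
}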