IR: Implement FPMulX IR instruction

2018-08-02 14:11:14 +01:00 · 2018-08-02 14:11:14 +01:00 · 17f73974f2
commit 17f73974f2
parent a8b938ef32
4 changed files with 115 additions and 0 deletions
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@ -36,6 +36,8 @@ namespace mp = Dynarmic::Common::mp;

 namespace {

+const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1); // placeholder (register index -1) for a register variable that must be initialised but is not meant to appear in emitted code
+
 constexpr u64 f32_negative_zero = 0x80000000u;
 constexpr u64 f32_nan = 0x7fc00000u;
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
@ -669,6 +671,107 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMulAdd<64>(code, ctx, inst);
 }

+template<size_t fsize> // Emits x64 code for the FPMulX IR instruction; fsize is the operand width in bits (instantiated with 32 and 64 below)
+static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { // multiplies args[0] by args[1]; a NaN product is fixed up in far code; the result register is bound to inst
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const bool do_default_nan = ctx.FPSCR_DN() || !ctx.AccurateNaN(); // true: any NaN operand yields DefaultNaN; false: the NaN operands are inspected individually below
+
+    const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Reg32 tmp = do_default_nan ? INVALID_REG.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32(); // a GPR is only allocated when the non-default-NaN branch, its sole user, will be emitted
+
+    Xbyak::Label end, nan, op_are_nans;
+    // Near code: plain multiply, then check whether the product is NaN. (FCODE presumably picks the ss/sd instruction form from fsize; the macro is defined outside this view.)
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        FCODE(vmuls)(result, op1, op2); // three-operand form: no copy of op1 needed
+    } else {
+        code.movaps(result, op1);
+        FCODE(muls)(result, op2);
+    }
+    FCODE(ucomis)(result, result); // a value compares unordered with itself (PF = 1) only when it is NaN
+    code.jp(nan, code.T_NEAR);
+    code.L(end); // every far-code path below jumps back to this label
+    // Far code: only reached when the product is NaN.
+    code.SwitchToFarCode();
+    code.L(nan);
+    FCODE(ucomis)(op1, op2); // unordered (PF = 1) when at least one operand is NaN
+    code.jp(op_are_nans); // NOTE(review): no T_NEAR here, unlike the other jumps; presumably a short jump is enough for the few instructions skipped, confirm
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { // neither operand is NaN but the product is (e.g. zero times infinity): build a replacement value
+        code.vxorps(result, op1, op2); // sign bit of result = sign(op1) XOR sign(op2)
+    } else {
+        code.movaps(result, op1);
+        code.xorps(result, op2);
+    }
+    code.andps(result, code.MConst(xword, FP::FPInfo<FPT>::sign_mask)); // keep only the sign bit
+    code.orps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>())); // OR in the constant; presumably FPValue<FPT, false, 0, 2> encodes +2.0, giving a correctly signed 2.0; confirm against FPValue's definition
+    code.jmp(end, code.T_NEAR);
+    code.L(op_are_nans); // at least one operand is NaN
+    if (do_default_nan) {
+        code.movaps(result, code.MConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
+        code.jmp(end, code.T_NEAR);
+    } else {
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vxorps(xmm0, op1, op2); // xmm0 = op1 XOR op2; NOTE(review): xmm0 is written without going through reg_alloc, presumably reserved as scratch; confirm
+        } else {
+            code.movaps(xmm0, op1);
+            code.xorps(xmm0, op2);
+        }
+
+        constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask;
+        constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb;
+        constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+        constexpr size_t shift = fsize == 32 ? 0 : 48; // for 64-bit operands only the top 16 bits are extracted, so the masks below are shifted down to match
+
+        if constexpr (fsize == 32) {
+            code.movd(tmp, xmm0);
+        } else {
+            code.pextrw(tmp, xmm0, shift / 16); // word index 3 = bits 48..63 of op1 XOR op2
+        }
+        code.and_(tmp, static_cast<u32>((exponent_mask | mantissa_msb) >> shift)); // isolate the exponent bits and the mantissa MSB of the XOR
+        code.cmp(tmp, static_cast<u32>(mantissa_msb >> shift)); // equal only when the exponents match and the mantissa MSBs differ
+        code.jne(end, code.T_NEAR); // (op1 != NaN || op2 != NaN) OR (op1 == SNaN && op2 == SNaN) OR (op1 == QNaN && op2 == QNaN) OR (op1 == SNaN && op2 == Inf) OR (op1 == Inf && op2 == SNaN)
+
+        // If we're here there are four cases left:
+        // op1 == SNaN && op2 == QNaN
+        // op1 == Inf  && op2 == QNaN
+        // op1 == QNaN && op2 == SNaN <<< The problematic case
+        // op1 == QNaN && op2 == Inf
+
+        if constexpr (fsize == 32) {
+            code.movd(tmp, op2);
+            code.shl(tmp, 32 - mantissa_msb_bit); // CF = op2's mantissa MSB (last bit shifted out); ZF = 1 when the remaining lower mantissa bits are all zero
+        } else {
+            code.movq(tmp.cvt64(), op2);
+            code.shl(tmp.cvt64(), 64 - mantissa_msb_bit); // same flag outcome as the 32-bit case above
+        }
+        // If op2 is a SNaN, CF = 0 and ZF = 0.
+        code.jna(end, code.T_NEAR); // taken when CF = 1 or ZF = 1, i.e. op2 is not an SNaN: the multiply's own result is kept
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { // op1 == QNaN && op2 == SNaN: result = op2 with its mantissa MSB set
+            code.vorps(result, op2, code.MConst(xword, mantissa_msb));
+        } else {
+            code.movaps(result, op2);
+            code.orps(result, code.MConst(xword, mantissa_msb));
+        }
+        code.jmp(end, code.T_NEAR);
+    }
+    code.SwitchToNearCode();
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPMulX32(EmitContext& ctx, IR::Inst* inst) { // 32-bit entry point for the shared FPMulX emitter
+    return EmitFPMulX<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) { // 64-bit entry point for the shared FPMulX emitter
+    return EmitFPMulX<64>(code, ctx, inst);
+}
+
 template<typename FPT>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@ -1612,6 +1612,15 @@ U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bo
    }
 }

+U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
+    // Both operands must share one type; that type selects the opcode emitted below.
+    ASSERT(a.GetType() == b.GetType());
+    if (a.GetType() != Type::U32) {
+        return Inst<U64>(Opcode::FPMulX64, a, b);
+    }
+    return Inst<U32>(Opcode::FPMulX32, a, b);
+}
+
 U32U64 IREmitter::FPNeg(const U32U64& a) {
    if (a.GetType() == Type::U32) {
        return Inst<U32>(Opcode::FPNeg32, a);
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@ -278,6 +278,7 @@ public:
    U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
    U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
    U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpscr_controlled);
+    U32U64 FPMulX(const U32U64& a, const U32U64& b); // a and b must have the same type; emits FPMulX32 for U32 operands, FPMulX64 otherwise
    U32U64 FPNeg(const U32U64& a);
    U32U64 FPRecipEstimate(const U32U64& a);
    U32U64 FPRecipStepFused(const U32U64& a, const U32U64& b);
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@ -421,6 +421,8 @@ OPCODE(FPMul32,                                 T::U32,         T::U32,
 OPCODE(FPMul64,                                 T::U64,         T::U64,         T::U64                          )
 OPCODE(FPMulAdd32,                              T::U32,         T::U32,         T::U32,         T::U32          )
 OPCODE(FPMulAdd64,                              T::U64,         T::U64,         T::U64,         T::U64          )
+OPCODE(FPMulX32,                                T::U32,         T::U32,         T::U32                          ) // 32-bit FPMulX: presumably result type first, then the two operand types
+OPCODE(FPMulX64,                                T::U64,         T::U64,         T::U64                          ) // 64-bit FPMulX: same layout with U64 throughout
 OPCODE(FPNeg32,                                 T::U32,         T::U32                                          )
 OPCODE(FPNeg64,                                 T::U64,         T::U64                                          )
 OPCODE(FPRecipEstimate32,                       T::U32,         T::U32                                          )