IR: Implement FPMulX IR instruction

2018-08-02 14:11:14 +01:00 · 2018-08-02 14:11:14 +01:00 · 17f73974f2
commit 17f73974f2
parent a8b938ef32
4 changed files with 115 additions and 0 deletions
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@ -36,6 +36,8 @@ namespace mp = Dynarmic::Common::mp;
 namespace {
 // Placeholder 64-bit register built from index -1. In this file it stands in for a
 // scratch GPR in code paths that never actually use one (see `tmp` in EmitFPMulX).
 const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
 // 32-bit float bit patterns, held in u64 variables.
 // Only the sign bit (bit 31) set.
 constexpr u64 f32_negative_zero = 0x80000000u;
 // All exponent bits set plus the top mantissa bit: a quiet NaN pattern.
 constexpr u64 f32_nan = 0x7fc00000u;
 // Every bit except the sign bit.
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
@ -669,6 +671,107 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMulAdd<64>(code, ctx, inst);
 }
 // Emits host code for the FPMulX IR instruction on fsize-bit operands
 // (instantiated in this file with fsize = 32 and fsize = 64).
 //
 // Near (fast) path: multiply args[0] by args[1]; if the product is not a NaN,
 // execution falls straight through to `end`.
 //
 // Far (slow) path, reached only when the product is a NaN:
 //   * neither operand is a NaN: the result is replaced by a constant carrying
 //     the sign of op1 ^ op2;
 //   * an operand is a NaN and default-NaN handling applies: the result is
 //     FP::FPInfo<FPT>::DefaultNaN();
 //   * otherwise: the hardware product is kept, except when op1 is a QNaN and
 //     op2 is an SNaN, in which case the result is op2 with its quiet bit set.
 //
 // code - code buffer the instructions are appended to.
 // ctx  - emit context; supplies the register allocator and the NaN-mode flags.
 // inst - the IR instruction being lowered; its value is defined as `result`.
 template<size_t fsize>
 static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    // Unsigned integer type with the same width as the floating-point operands.
    using FPT = mp::unsigned_integer_of_size<fsize>;
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    // Default-NaN handling is used when ctx.FPSCR_DN() is set or ctx.AccurateNaN() is off.
    const bool do_default_nan = ctx.FPSCR_DN() || !ctx.AccurateNaN();
    const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]);
    const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    // `tmp` is only touched in the !do_default_nan branch below, so a scratch GPR is
    // allocated only in that case; INVALID_REG is an unused placeholder otherwise.
    const Xbyak::Reg32 tmp = do_default_nan ? INVALID_REG.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32();
    Xbyak::Label end, nan, op_are_nans;
    // result = op1 * op2. The AVX three-operand form avoids the extra register copy.
    // NOTE(review): FCODE presumably appends the ss/sd suffix matching fsize - confirm
    // against the macro definition.
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        FCODE(vmuls)(result, op1, op2);
    } else {
        code.movaps(result, op1);
        FCODE(muls)(result, op2);
    }
    // Comparing a value with itself is unordered (PF = 1) only when it is a NaN.
    FCODE(ucomis)(result, result);
    code.jp(nan, code.T_NEAR);
    // Every far-code path below jumps back to this point.
    code.L(end);
    code.SwitchToFarCode();
    code.L(nan);
    // Unordered compare of the inputs: PF = 1 when op1 or op2 is a NaN.
    // NOTE(review): this is the only jump here emitted without T_NEAR; the target is
    // only a handful of instructions ahead.
    FCODE(ucomis)(op1, op2);
    code.jp(op_are_nans);
    // The product is a NaN although neither input is one. Build the replacement value:
    // (sign bit of op1 ^ op2) | constant.
    // NOTE(review): presumably this is the zero-times-infinity case and
    // FP::FPValue<FPT, false, 0, 2>() encodes +2.0 (the FMULX special result) - confirm
    // against the FP::FPValue definition.
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        code.vxorps(result, op1, op2);
    } else {
        code.movaps(result, op1);
        code.xorps(result, op2);
    }
    code.andps(result, code.MConst(xword, FP::FPInfo<FPT>::sign_mask));
    code.orps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>()));
    code.jmp(end, code.T_NEAR);
    code.L(op_are_nans);
    if (do_default_nan) {
        // Any NaN input yields the default NaN.
        code.movaps(result, code.MConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
        code.jmp(end, code.T_NEAR);
    } else {
        // At least one of op1/op2 is a NaN, so at least one has an all-ones exponent.
        // Classify the pair by inspecting op1 ^ op2.
        // NOTE(review): xmm0 is written here without going through reg_alloc;
        // presumably the backend reserves it as a scratch register - confirm.
        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
            code.vxorps(xmm0, op1, op2);
        } else {
            code.movaps(xmm0, op1);
            code.xorps(xmm0, op2);
        }
        constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask;
        constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb;
        // Bit index of the most significant explicit mantissa bit (the NaN quiet bit).
        constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
        // For 64-bit operands only the top 16 bits (bits 48..63) of the xor are examined,
        // which keeps the masks below within 32-bit immediates.
        constexpr size_t shift = fsize == 32 ? 0 : 48;
        if constexpr (fsize == 32) {
            code.movd(tmp, xmm0);
        } else {
            code.pextrw(tmp, xmm0, shift / 16);
        }
        // Keep only the exponent and quiet-bit positions of op1 ^ op2, then require
        // "exponent bits all equal, quiet bits different".
        code.and_(tmp, static_cast<u32>((exponent_mask | mantissa_msb) >> shift));
        code.cmp(tmp, static_cast<u32>(mantissa_msb >> shift));
        // Leave with the hardware product when the pattern does not match, i.e. for:
        //   (op1 != NaN || op2 != NaN)
        //   OR (op1 == SNaN && op2 == SNaN) OR (op1 == QNaN && op2 == QNaN)
        //   OR (op1 == SNaN && op2 == Inf)  OR (op1 == Inf  && op2 == SNaN)
        code.jne(end, code.T_NEAR);
        // If we're here there are four cases left:
        // op1 == SNaN && op2 == QNaN
        // op1 == Inf  && op2 == QNaN
        // op1 == QNaN && op2 == SNaN <<< The problematic case
        // op1 == QNaN && op2 == Inf
        // Shift op2 left so that its quiet bit is the last bit shifted out (CF) and ZF
        // reports whether the mantissa bits below the quiet bit are all zero.
        if constexpr (fsize == 32) {
            code.movd(tmp, op2);
            code.shl(tmp, 32 - mantissa_msb_bit);
        } else {
            code.movq(tmp.cvt64(), op2);
            code.shl(tmp.cvt64(), 64 - mantissa_msb_bit);
        }
        // If op2 is a SNaN, CF = 0 and ZF = 0.
        // jna is taken when CF = 1 (op2 is a QNaN) or ZF = 1 (op2 is an Inf); in both
        // cases the hardware product is kept.
        code.jna(end, code.T_NEAR);
        // op1 is a QNaN and op2 is an SNaN: return op2 with its quiet bit set instead
        // of the hardware product.
        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
            code.vorps(result, op2, code.MConst(xword, mantissa_msb));
        } else {
            code.movaps(result, op2);
            code.orps(result, code.MConst(xword, mantissa_msb));
        }
        code.jmp(end, code.T_NEAR);
    }
    code.SwitchToNearCode();
    ctx.reg_alloc.DefineValue(inst, result);
 }
 // Emits FPMulX for 32-bit operands by forwarding to the shared EmitFPMulX template
 // with fsize = 32, using this emitter's `code` member.
 void EmitX64::EmitFPMulX32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMulX<32>(code, ctx, inst);
 }
 // Emits FPMulX for 64-bit operands by forwarding to the shared EmitFPMulX template
 // with fsize = 64, using this emitter's `code` member.
 void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMulX<64>(code, ctx, inst);
 }
 template<typename FPT>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@ -1612,6 +1612,15 @@ U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bo
    }
 }
 // Creates an FPMulX IR instruction for operands `a` and `b`.
 // Both operands must have the same type (asserted). A U32 operand type selects
 // Opcode::FPMulX32; any other type selects Opcode::FPMulX64.
 U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
    ASSERT(a.GetType() == b.GetType());

    const bool is_single_width = a.GetType() == Type::U32;
    if (!is_single_width) {
        return Inst<U64>(Opcode::FPMulX64, a, b);
    }
    return Inst<U32>(Opcode::FPMulX32, a, b);
 }
 U32U64 IREmitter::FPNeg(const U32U64& a) {
    if (a.GetType() == Type::U32) {
        return Inst<U32>(Opcode::FPNeg32, a);
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@ -278,6 +278,7 @@ public:
    U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
    U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
    U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpscr_controlled);
    // Emits an FPMulX32 or FPMulX64 instruction depending on the type of `a`;
    // `a` and `b` must have the same type.
    U32U64 FPMulX(const U32U64& a, const U32U64& b);
    U32U64 FPNeg(const U32U64& a);
    U32U64 FPRecipEstimate(const U32U64& a);
    U32U64 FPRecipStepFused(const U32U64& a, const U32U64& b);
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@ -421,6 +421,8 @@ OPCODE(FPMul32,                                 T::U32,         T::U32,
 OPCODE(FPMul64,                                 T::U64,         T::U64,         T::U64                          )
 OPCODE(FPMulAdd32,                              T::U32,         T::U32,         T::U32,         T::U32          )
 OPCODE(FPMulAdd64,                              T::U64,         T::U64,         T::U64,         T::U64          )
 // FPMulX: two operands of one width, result of that same width.
 // NOTE(review): column order is presumably (name, result type, argument types...) - confirm
 // against the OPCODE macro's users.
 OPCODE(FPMulX32,                                T::U32,         T::U32,         T::U32                          )
 OPCODE(FPMulX64,                                T::U64,         T::U64,         T::U64                          )
 OPCODE(FPNeg32,                                 T::U32,         T::U32                                          )
 OPCODE(FPNeg64,                                 T::U64,         T::U64                                          )
 OPCODE(FPRecipEstimate32,                       T::U32,         T::U32                                          )