From b46e6a24dc73c6c6eeeaf37c50fcb2f85747428e Mon Sep 17 00:00:00 2001
From: Merry
Date: Fri, 28 May 2021 14:50:07 +0100
Subject: [PATCH] emit_x64_vector_saturation: AVX implementation of
 EmitVectorUnsignedSaturatedAdd32

---
 .../x64/emit_x64_vector_saturation.cpp | 69 +++++++++++--------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
index 6aea6ece..89afff05 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
@@ -222,45 +222,54 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst)
 
 void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-    // TODO AVX2
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        // Do a regular unsigned addition
-        code.vpaddd(result, result, addend);
+        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
 
-        // Test if an overflow happened
-        code.vpcmpud(k1, result, addend, CmpInt::LessThan);
-
-        // Write 0b1111... where overflows have happened
-        // This is just a quick way to do this without touching memory
+        code.vpaddd(result, operand1, operand2);
+        code.vpcmpud(k1, result, operand2, CmpInt::LessThan);
         code.vpternlogd(result | k1, result, result, 0xFF);
-
-        // Set ZF if an overflow happened
         code.ktestb(k1, k1);
+
+        code.setnz(overflow);
+        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
+    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vpxor(xmm0, operand1, operand2);
+        code.vpand(tmp, operand1, operand2);
+        code.vpaddd(result, operand1, operand2);
     } else {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-
-        code.movaps(tmp, result);
-        code.movaps(xmm0, result);
+        code.movaps(tmp, operand1);
+        code.movaps(xmm0, operand1);
 
-        code.pxor(xmm0, addend);
-        code.pand(tmp, addend);
-        code.paddd(result, addend);
+        code.pxor(xmm0, operand2);
+        code.pand(tmp, operand2);
+        code.paddd(result, operand2);
+    }
 
-        code.psrld(xmm0, 1);
-        code.paddd(tmp, xmm0);
-        code.psrad(tmp, 31);
+    code.psrld(xmm0, 1);
+    code.paddd(tmp, xmm0);
+    code.psrad(tmp, 31);
 
-        code.por(result, tmp);
+    code.por(result, tmp);
 
-        if (code.HasHostFeature(HostFeature::SSE41)) {
-            code.ptest(tmp, tmp);
-        } else {
-            code.movmskps(overflow.cvt32(), tmp);
-            code.test(overflow.cvt32(), overflow.cvt32());
-        }
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        code.ptest(tmp, tmp);
+    } else {
+        code.movmskps(overflow.cvt32(), tmp);
+        code.test(overflow.cvt32(), overflow.cvt32());
     }
 
     code.setnz(overflow);
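
Editor's note, not part of the patch: the SSE/AVX fallback paths above compute
the wrapped sum and then recover the per-lane carry-out without widening to
64 bits, using the identity a + b == (a ^ b) + 2*(a & b). Because of it,
bit 31 of ((a ^ b) >> 1) + (a & b) is exactly the carry out of bit 31;
psrad then smears that bit across the lane, and por forces overflowed lanes
to 0xFFFFFFFF. Below is a minimal one-lane C++ sketch of the trick;
saturated_add_u32 is an illustrative name, not a dynarmic function.

    #include <cstdint>
    #include <cstdio>

    // Models one 32-bit lane of the non-AVX512 path in the patch.
    // Since a + b == (a ^ b) + 2*(a & b), bit 31 of ((a ^ b) >> 1) + (a & b)
    // equals the carry out of a 32-bit unsigned add.
    static uint32_t saturated_add_u32(uint32_t a, uint32_t b, bool& qc) {
        const uint32_t sum  = a + b;                     // paddd: wrapping add
        const uint32_t half = ((a ^ b) >> 1) + (a & b);  // pxor/psrld + pand/paddd
        // psrad tmp, 31: arithmetic shift smears the carry into an all-ones mask
        // (signed right shift is arithmetic on all mainstream compilers).
        const uint32_t mask = static_cast<uint32_t>(static_cast<int32_t>(half) >> 31);
        qc = qc || (mask != 0);                          // sticky FPSR.QC update
        return sum | mask;                               // por: saturate to 0xFFFFFFFF
    }

    int main() {
        bool qc = false;
        std::printf("%08x\n", saturated_add_u32(0xfffffff0u, 0x20u, qc));        // ffffffff
        std::printf("%08x qc=%d\n", saturated_add_u32(1u, 2u, qc), (int)qc);     // 00000003 qc=1
    }

The AVX512 path needs no such identity: vpcmpud with CmpInt::LessThan flags
lanes where the wrapped sum is smaller than an addend (which only happens on
carry-out), vpternlogd with imm8 0xFF writes all-ones into those lanes under
mask k1, and ktestb folds "any lane overflowed" into ZF for the setnz that
feeds the sticky FPSR.QC byte.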