Merge pull request #2609 from FernandoS27/new-scan

Implement a New Shader Scanner, Decompile Flow Stack and implement BRX BRA.CC
2019-07-11 17:36:23 -04:00 · 2019-07-11 17:36:23 -04:00 · bb67091c77
commit bb67091c77
parent 79c382fafd f2549739d1
16 changed files with 778 additions and 124 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@ -82,6 +82,8 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/shift.cpp"
    "${VIDEO_CORE}/shader/decode/video.cpp"
    "${VIDEO_CORE}/shader/decode/xmad.cpp"
+    "${VIDEO_CORE}/shader/control_flow.cpp"
+    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/decode.cpp"
    "${VIDEO_CORE}/shader/node.h"
    "${VIDEO_CORE}/shader/node_helper.cpp"
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -56,6 +56,8 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/shift.cpp"
      "${VIDEO_CORE}/shader/decode/video.cpp"
      "${VIDEO_CORE}/shader/decode/xmad.cpp"
+      "${VIDEO_CORE}/shader/control_flow.cpp"
+      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/decode.cpp"
      "${VIDEO_CORE}/shader/node.h"
      "${VIDEO_CORE}/shader/node_helper.cpp"
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -103,6 +103,8 @@ add_library(video_core STATIC
    shader/decode/video.cpp
    shader/decode/xmad.cpp
    shader/decode/other.cpp
+    shader/control_flow.cpp
+    shader/control_flow.h
    shader/decode.cpp
    shader/node_helper.cpp
    shader/node_helper.h
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@ -1367,6 +1367,20 @@ union Instruction {
        }
    } bra;

+    union {
+        BitField<20, 24, u64> target;
+        BitField<5, 1, u64> constant_buffer;
+
+        /// Returns the sign-extended 24-bit branch offset converted from bytes to instructions,
+        /// plus one because the encoded offset is relative to the next instruction.
+        s32 GetBranchExtend() const {
+            // Sign extend the branch target offset
+            constexpr u32 mask = 1U << (24 - 1);
+            const u32 value = static_cast<u32>(target);
+            // sizeof yields an unsigned std::size_t; dividing by it directly would convert a
+            // negative offset to unsigned before the division, so use a signed divisor.
+            constexpr s32 instruction_size = static_cast<s32>(sizeof(Instruction));
+            // The branch offset is relative to the next instruction and is stored in bytes, so
+            // divide it by the size of an instruction and add 1 to it.
+            return static_cast<s32>((value ^ mask) - mask) / instruction_size + 1;
+        }
+    } brx;
+
    union {
        BitField<39, 1, u64> emit; // EmitVertex
        BitField<40, 1, u64> cut;  // EndPrimitive
@ -1464,6 +1478,7 @@ public:
        BFE_IMM,
        BFI_IMM_R,
        BRA,
+        BRX,
        PBK,
        LD_A,
        LD_L,
@ -1738,6 +1753,7 @@ private:
            INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
            INST("111000101010----", Id::PBK, Type::Flow, "PBK"),
            INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
+            INST("111000100101----", Id::BRX, Type::Flow, "BRX"),
            INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
            INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
            INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@ -129,9 +129,11 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {

 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
-                        const ProgramCode& code_b) {
-    u64 unique_identifier =
-        Common::CityHash64(reinterpret_cast<const char*>(code.data()), CalculateProgramSize(code));
+                        const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
+    if (size_a == 0) {
+        size_a = CalculateProgramSize(code);
+    }
+    u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
    if (program_type != Maxwell::ShaderProgram::VertexA) {
        return unique_identifier;
    }
@ -140,8 +142,11 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
    std::size_t seed = 0;
    boost::hash_combine(seed, unique_identifier);

-    const u64 identifier_b = Common::CityHash64(reinterpret_cast<const char*>(code_b.data()),
-                                                CalculateProgramSize(code_b));
+    if (size_b == 0) {
+        size_b = CalculateProgramSize(code_b);
+    }
+    const u64 identifier_b =
+        Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b);
    boost::hash_combine(seed, identifier_b);
    return static_cast<u64>(seed);
 }
@ -150,14 +155,17 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
                                      ProgramCode program_code, ProgramCode program_code_b) {
    GLShader::ShaderSetup setup(program_code);
+    setup.program.size_a = CalculateProgramSize(program_code);
+    setup.program.size_b = 0;
    if (program_type == Maxwell::ShaderProgram::VertexA) {
        // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
        // Conventional HW does not support this, so we combine VertexA and VertexB into one
        // stage here.
        setup.SetProgramB(program_code_b);
+        setup.program.size_b = CalculateProgramSize(program_code_b);
    }
-    setup.program.unique_identifier =
-        GetUniqueIdentifier(program_type, program_code, program_code_b);
+    setup.program.unique_identifier = GetUniqueIdentifier(
+        program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);

    switch (program_type) {
    case Maxwell::ShaderProgram::VertexA:
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@ -191,10 +191,12 @@ public:

        // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
        // unlikely that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
-        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
-            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
-            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        if (!ir.IsFlowStackDisabled()) {
+            constexpr u32 FLOW_STACK_SIZE = 20;
+            for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+                code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+                code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+            }
        }

        code.AddLine("while (true) {{");
@ -1555,6 +1557,14 @@ private:
        return {};
    }

+    /// Emits an indirect branch: writes the first operand (visited as a uint) into jmp_to and
+    /// then emits a "break;" line. Produces no value expression.
+    std::string BranchIndirect(Operation operation) {
+        const std::string target_expr = VisitOperand(operation, 0, Type::Uint);
+        code.AddLine("jmp_to = {};", target_expr);
+        code.AddLine("break;");
+        return std::string{};
+    }
+
    std::string PushFlowStack(Operation operation) {
        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
@ -1789,6 +1799,7 @@ private:
        &GLSLDecompiler::ImageStore,

        &GLSLDecompiler::Branch,
+        &GLSLDecompiler::BranchIndirect,
        &GLSLDecompiler::PushFlowStack,
        &GLSLDecompiler::PopFlowStack,
        &GLSLDecompiler::Exit,
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@ -29,14 +29,14 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");

    out += program.first;

    if (setup.IsDualProgram()) {
-        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
+        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
        ProgramResult program_b =
            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");

@ -80,7 +80,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
    out += program.first;
@ -115,7 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");

--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@ -27,6 +27,8 @@ struct ShaderSetup {
        ProgramCode code;
        ProgramCode code_b; // Used for dual vertex shaders
        u64 unique_identifier;
+        std::size_t size_a;
+        std::size_t size_b;
    } program;

    /// Used in scenarios where we have a dual vertex shaders
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@ -949,6 +949,14 @@ private:
        return {};
    }

+    /// Emits an indirect branch: stores the first operand (visited as a uint) into jmp_to and
+    /// then branches to continue_label through BranchingOp. Produces no result id.
+    Id BranchIndirect(Operation operation) {
+        const Id target_address = VisitOperand<Type::Uint>(operation, 0);
+        Emit(OpStore(jmp_to, target_address));
+
+        BranchingOp([this]() { Emit(OpBranch(continue_label)); });
+        return Id{};
+    }
+
    Id PushFlowStack(Operation operation) {
        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
        ASSERT(target);
@ -1334,6 +1342,7 @@ private:
        &SPIRVDecompiler::ImageStore,

        &SPIRVDecompiler::Branch,
+        &SPIRVDecompiler::BranchIndirect,
        &SPIRVDecompiler::PushFlowStack,
        &SPIRVDecompiler::PopFlowStack,
        &SPIRVDecompiler::Exit,
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@ -0,0 +1,476 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <list>
+#include <map>
+#include <stack>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/shader/control_flow.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+
+// Placeholder branch address that ParseCode assigns to SYNC and BRK branches; TryQuery later
+// replaces it with the top of the matching SSY/PBK stack.
+constexpr s32 unassigned_branch = -2;
+
+/// A pending stack-analysis request: the address of the block to visit together with the
+/// SSY and PBK stacks as they are on arrival at that block.
+struct Query {
+    u32 address = 0;
+    std::stack<u32> ssy_stack;
+    std::stack<u32> pbk_stack;
+};
+
+/// Snapshot of the SSY/PBK stacks a block was first visited with.
+/// No copy constructor is declared on purpose: the implicit copy and move operations are
+/// sufficient, and declaring one would suppress the implicit move operations.
+struct BlockStack {
+    BlockStack() = default;
+    /// Captures the stacks carried by a query; explicit to avoid silent Query conversions.
+    explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+    std::stack<u32> ssy_stack{};
+    std::stack<u32> pbk_stack{};
+};
+
+/// Describes the control-flow instruction that terminates a block.
+struct BlockBranchInfo {
+    Condition condition;          // Predicate and condition code guarding the branch
+    s32 address = exit_branch;    // Target address, exit_branch or unassigned_branch
+    bool kill = false;            // Set for KIL
+    bool is_sync = false;         // Set for SYNC
+    bool is_brk = false;          // Set for BRK
+    bool ignore = false;          // Set when the block simply runs into the next block
+};
+
+/// A contiguous range of instructions [start, end] and the branch that terminates it.
+struct BlockInfo {
+    u32 start = 0;
+    u32 end = 0;
+    bool visited = false;
+    BlockBranchInfo branch;
+
+    /// Returns true when address lies within the inclusive range [start, end].
+    bool IsInside(const u32 address) const {
+        const bool at_or_after_start = address >= start;
+        const bool at_or_before_end = address <= end;
+        return at_or_after_start && at_or_before_end;
+    }
+};
+
+/// Working state shared by the block scanner (TryInspectAddress/ParseCode) and the stack
+/// resolver (TryQuery).
+struct CFGRebuildState {
+    /// Keeps a reference to program_code, so the code must outlive this state object.
+    /// The initializer list follows member declaration order, which is the order in which
+    /// members are actually initialized.
+    explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
+                             const u32 start)
+        : start{start}, program_code{program_code}, program_size{program_size} {}
+
+    u32 start{};                                  // Address scanning starts from
+    std::vector<BlockInfo> block_info{};          // Blocks discovered so far
+    std::list<u32> inspect_queries{};             // Addresses still to be scanned
+    std::list<Query> queries{};                   // Pending stack-resolution queries
+    std::unordered_map<u32, u32> registered{};    // Block start address -> index in block_info
+    std::unordered_set<u32> labels{};             // Branch/SSY/PBK target addresses
+    std::map<u32, u32> ssy_labels{};              // SSY instruction address -> its target
+    std::map<u32, u32> pbk_labels{};              // PBK instruction address -> its target
+    std::unordered_map<u32, BlockStack> stacks{}; // Stacks each block was first visited with
+    const ProgramCode& program_code;
+    const std::size_t program_size;
+};
+
+// Result of looking an address up among the known blocks (see TryGetBlock):
+// None - no block contains it, Found - a block starts exactly there,
+// Inside - it falls within a block but is not that block's start.
+enum class BlockCollision : u32 { None, Found, Inside };
+
+/// Searches the known blocks, in creation order, for one that starts at or contains address.
+/// Returns the kind of match and the block's index; the index is u32(-1) when nothing matches.
+std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) {
+    u32 index = 0;
+    for (const BlockInfo& candidate : state.block_info) {
+        if (candidate.start == address) {
+            return {BlockCollision::Found, index};
+        }
+        if (candidate.IsInside(address)) {
+            return {BlockCollision::Inside, index};
+        }
+        ++index;
+    }
+    return {BlockCollision::None, static_cast<u32>(-1)};
+}
+
+/// Output of ParseCode: the branch that ended the scanned range and its last address.
+struct ParseInfo {
+    BlockBranchInfo branch_info;
+    u32 end_address = 0;
+};
+
+/// Appends a new block covering [start, end] and registers its start address.
+/// The returned reference points into state.block_info, so it is only valid until that
+/// vector grows again.
+BlockInfo& CreateBlockInfo(CFGRebuildState& state, u32 start, u32 end) {
+    // The new element lands at the current size, which becomes its index.
+    const u32 new_index = static_cast<u32>(state.block_info.size());
+    BlockInfo& created = state.block_info.emplace_back();
+    created.start = start;
+    created.end = end;
+    state.registered.insert({start, new_index});
+    return created;
+}
+
+/// Maps a predicate index and negation flag onto Pred; negated predicates are offset by 8.
+Pred GetPredicate(u32 index, bool negated) {
+    const u32 negation_offset = negated ? 8 : 0;
+    return static_cast<Pred>(index + negation_offset);
+}
+
+/**
+ * Tells whether the instruction at the given offset is a 'sched' instruction.
+ * A sched instruction precedes every group of 3 instructions, i.e. it sits at every 4th
+ * slot counted from main_offset.
+ */
+constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
+    constexpr u32 SchedPeriod = 4;
+    return ((offset - main_offset) % SchedPeriod) == 0;
+}
+
+// Outcome of ParseCode.
+enum class ParseResult : u32 {
+    // A flow instruction (EXIT, BRA, SYNC, BRK or KIL) terminated the block.
+    ControlCaught,
+    // The scan ran into an already registered block or past the end of the program.
+    BlockEnd,
+    // A BRX, or a BRA reading its target from a constant buffer, was found.
+    AbnormalFlow,
+};
+
+/// Decodes the predicate and condition code that guard a flow instruction into condition.
+/// Returns false when the instruction can never execute (predicate is Pred::NeverExecute or
+/// the condition code is ConditionCode::F); condition keeps whatever was decoded up to then.
+static bool ParseFlowCondition(const Instruction& instr, Condition& condition) {
+    const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+    condition.predicate = GetPredicate(pred_index, instr.negate_pred != 0);
+    if (condition.predicate == Pred::NeverExecute) {
+        return false;
+    }
+    const ConditionCode cc = instr.flow_condition_code;
+    condition.cc = cc;
+    return cc != ConditionCode::F;
+}
+
+/// Scans instructions starting at address until something terminates the block.
+/// Returns:
+///  - ControlCaught when an executable EXIT, BRA, SYNC, BRK or KIL is found; end_address is
+///    the address of that instruction and branch_info describes it.
+///  - BlockEnd when the scan reaches an already registered block or the end of the program;
+///    end_address is the last address scanned.
+///  - AbnormalFlow on BRX or on a BRA that uses a constant buffer.
+/// Targets of BRA, SSY and PBK are recorded as labels and queued for inspection.
+std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) {
+    using ParseOutcome = std::pair<ParseResult, ParseInfo>;
+
+    u32 offset = static_cast<u32>(address);
+    const u32 end_address = static_cast<u32>(state.program_size / sizeof(Instruction));
+    ParseInfo parse_info{};
+    BlockBranchInfo& branch = parse_info.branch_info;
+
+    // Records a branch target; a target seen for the first time is queued for inspection.
+    const auto insert_label = [&state](u32 label) {
+        const auto pair = state.labels.emplace(label);
+        if (pair.second) {
+            state.inspect_queries.push_back(label);
+        }
+    };
+
+    // Fills in the terminating branch at the current offset and builds the result.
+    const auto control_caught = [&](s32 target, bool kill, bool is_sync,
+                                    bool is_brk) -> ParseOutcome {
+        branch.address = target;
+        branch.kill = kill;
+        branch.is_sync = is_sync;
+        branch.is_brk = is_brk;
+        branch.ignore = false;
+        parse_info.end_address = offset;
+        return {ParseResult::ControlCaught, parse_info};
+    };
+
+    while (true) {
+        if (offset >= end_address) {
+            // ASSERT_OR_EXECUTE can't be used, as it ignores the break
+            ASSERT_MSG(false, "Shader passed the current limit!");
+            branch.address = exit_branch;
+            branch.ignore = false;
+            break;
+        }
+        if (state.registered.count(offset) != 0) {
+            // Ran into an existing block: this block just continues into it.
+            branch.address = offset;
+            branch.ignore = true;
+            break;
+        }
+        if (IsSchedInstruction(offset, state.start)) {
+            offset++;
+            continue;
+        }
+        const Instruction instr = {state.program_code[offset]};
+        const auto opcode = OpCode::Decode(instr);
+        if (!opcode || opcode->get().GetType() != OpCode::Type::Flow) {
+            offset++;
+            continue;
+        }
+
+        // In every case below, an instruction that can never execute is skipped by leaving
+        // the switch, which advances to the next offset.
+        switch (opcode->get().GetId()) {
+        case OpCode::Id::EXIT: {
+            if (ParseFlowCondition(instr, branch.condition)) {
+                return control_caught(exit_branch, false, false, false);
+            }
+            break;
+        }
+        case OpCode::Id::BRA: {
+            if (instr.bra.constant_buffer != 0) {
+                return {ParseResult::AbnormalFlow, parse_info};
+            }
+            if (!ParseFlowCondition(instr, branch.condition)) {
+                break;
+            }
+            const u32 branch_offset = offset + instr.bra.GetBranchTarget();
+            insert_label(branch_offset);
+            // A branch to address 0 is recorded as an exit.
+            const s32 target = branch_offset == 0 ? exit_branch : static_cast<s32>(branch_offset);
+            return control_caught(target, false, false, false);
+        }
+        case OpCode::Id::SYNC: {
+            if (ParseFlowCondition(instr, branch.condition)) {
+                return control_caught(unassigned_branch, false, true, false);
+            }
+            break;
+        }
+        case OpCode::Id::BRK: {
+            if (ParseFlowCondition(instr, branch.condition)) {
+                return control_caught(unassigned_branch, false, false, true);
+            }
+            break;
+        }
+        case OpCode::Id::KIL: {
+            if (ParseFlowCondition(instr, branch.condition)) {
+                return control_caught(exit_branch, true, false, false);
+            }
+            break;
+        }
+        case OpCode::Id::SSY: {
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(target);
+            state.ssy_labels.emplace(offset, target);
+            break;
+        }
+        case OpCode::Id::PBK: {
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(target);
+            state.pbk_labels.emplace(offset, target);
+            break;
+        }
+        case OpCode::Id::BRX: {
+            return {ParseResult::AbnormalFlow, parse_info};
+        }
+        default:
+            break;
+        }
+
+        offset++;
+    }
+    // NOTE(review): branch.condition may still hold values decoded from a flow instruction
+    // that was skipped as never-executing; TryInspectAddress tests IsUnconditional() on it.
+    // Confirm that this is intended.
+    branch.kill = false;
+    branch.is_sync = false;
+    branch.is_brk = false;
+    parse_info.end_address = offset - 1;
+    return {ParseResult::BlockEnd, parse_info};
+}
+
+/// Takes the next address off the inspection queue and makes sure a block starts there.
+/// Returns false when the queue is empty or when ParseCode reports abnormal flow, which
+/// aborts the CFG reconstruction; returns true otherwise.
+bool TryInspectAddress(CFGRebuildState& state) {
+    auto& pending = state.inspect_queries;
+    if (pending.empty()) {
+        return false;
+    }
+    const u32 address = pending.front();
+    pending.pop_front();
+
+    const auto [collision, index] = TryGetBlock(state, address);
+    if (collision == BlockCollision::Found) {
+        // A block already starts here, nothing to do.
+        return true;
+    }
+    if (collision == BlockCollision::Inside) {
+        // This case is the tricky one:
+        // we need to split the block into 2 separate blocks.
+        // Read the old end before appending, since appending may reallocate the vector.
+        const u32 old_end = state.block_info[index].end;
+        BlockInfo& tail = CreateBlockInfo(state, address, old_end);
+        BlockInfo& head = state.block_info[index];
+        // The tail inherits the original terminating branch ...
+        tail.branch = head.branch;
+        // ... while the head is cut short and simply continues into the tail.
+        head.end = address - 1;
+        BlockBranchInfo continue_into_tail{};
+        continue_into_tail.address = address;
+        continue_into_tail.ignore = true;
+        head.branch = continue_into_tail;
+        return true;
+    }
+
+    const auto [status, parsed] = ParseCode(state, address);
+    if (status == ParseResult::AbnormalFlow) {
+        // Abnormal flow ends the CFG reconstruction.
+        return false;
+    }
+
+    BlockInfo& created = CreateBlockInfo(state, address, parsed.end_address);
+    created.branch = parsed.branch_info;
+    if (!parsed.branch_info.condition.IsUnconditional()) {
+        // A conditional branch may fall through, so inspect the next address first.
+        pending.push_front(parsed.end_address + 1);
+    }
+    return true;
+}
+
+/// Processes the next pending query, propagating the SSY/PBK stacks through one block.
+/// Returns false when there is nothing to process, when the query address is not the start
+/// of a known block, when a block is reached again with different stacks, or when a SYNC/BRK
+/// has no matching SSY/PBK entry to consume. Returns true otherwise.
+bool TryQuery(CFGRebuildState& state) {
+    // Pushes the targets of every SSY/PBK located inside the block, in address order.
+    const auto gather_labels = [](std::stack<u32>& cc, std::map<u32, u32>& labels,
+                                  BlockInfo& block) {
+        auto gather_start = labels.lower_bound(block.start);
+        const auto gather_end = labels.upper_bound(block.end);
+        while (gather_start != gather_end) {
+            cc.push(gather_start->second);
+            gather_start++;
+        }
+    };
+    if (state.queries.empty()) {
+        return false;
+    }
+    Query& q = state.queries.front();
+    // Look the block up without operator[], which would insert a bogus entry on a miss.
+    const auto registered_it = state.registered.find(q.address);
+    if (registered_it == state.registered.end()) {
+        state.queries.pop_front();
+        return false;
+    }
+    BlockInfo& block = state.block_info[registered_it->second];
+    // If the block is visited, check if the stacks match, else gather the ssy/pbk
+    // labels into the current stack and look if the branch at the end of the block
+    // consumes a label. Schedule new queries accordingly
+    if (block.visited) {
+        BlockStack& stack = state.stacks[q.address];
+        const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
+                              (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
+        state.queries.pop_front();
+        return all_okay;
+    }
+    block.visited = true;
+    state.stacks[q.address] = BlockStack{q};
+    // Copy the query before popping it; q dangles after pop_front().
+    Query q2(q);
+    state.queries.pop_front();
+    gather_labels(q2.ssy_stack, state.ssy_labels, block);
+    gather_labels(q2.pbk_stack, state.pbk_labels, block);
+    if (!block.branch.condition.IsUnconditional()) {
+        // A conditional branch may fall through into the following block.
+        q2.address = block.end + 1;
+        state.queries.push_back(q2);
+    }
+    Query conditional_query{q2};
+    if (block.branch.is_sync) {
+        // top()/pop() on an empty stack is undefined; a SYNC without SSY is not decompilable.
+        if (conditional_query.ssy_stack.empty()) {
+            return false;
+        }
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = conditional_query.ssy_stack.top();
+        }
+        conditional_query.ssy_stack.pop();
+    }
+    if (block.branch.is_brk) {
+        // Same reasoning as above for a BRK without PBK.
+        if (conditional_query.pbk_stack.empty()) {
+            return false;
+        }
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = conditional_query.pbk_stack.top();
+        }
+        conditional_query.pbk_stack.pop();
+    }
+    if (block.branch.address == exit_branch) {
+        // Leaving the program: there is no block to follow.
+        return true;
+    }
+    conditional_query.address = block.branch.address;
+    state.queries.push_back(conditional_query);
+    return true;
+}
+
+/// Rebuilds the control flow graph of a shader program starting at start_address.
+/// Returns an empty optional when abnormal flow aborts block discovery. Otherwise returns the
+/// blocks sorted by start address; decompilable tells whether the SSY/PBK stacks could be
+/// resolved, and labels is only filled in that case.
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
+                                              u32 start_address) {
+    CFGRebuildState state{program_code, program_size, start_address};
+
+    // Inspect the code and generate blocks
+    state.labels.emplace(start_address);
+    state.inspect_queries.push_back(state.start);
+    while (!state.inspect_queries.empty()) {
+        if (!TryInspectAddress(state)) {
+            return std::nullopt;
+        }
+    }
+
+    // Resolve the flow stacks, stopping at the first query that fails
+    Query entry_query{};
+    entry_query.address = state.start;
+    state.queries.push_back(entry_query);
+    bool stacks_resolved = true;
+    while (stacks_resolved && !state.queries.empty()) {
+        stacks_resolved = TryQuery(state);
+    }
+
+    // Sort and organize the results
+    std::sort(state.block_info.begin(), state.block_info.end(),
+              [](const BlockInfo& lhs, const BlockInfo& rhs) { return lhs.start < rhs.start; });
+    ShaderCharacteristics result{};
+    result.decompilable = stacks_resolved;
+    result.start = start_address;
+    result.end = start_address;
+    for (auto& info : state.block_info) {
+        ShaderBlock shader_block{};
+        shader_block.start = info.start;
+        shader_block.end = info.end;
+        shader_block.ignore_branch = info.branch.ignore;
+        if (!shader_block.ignore_branch) {
+            shader_block.branch.cond = info.branch.condition;
+            shader_block.branch.kills = info.branch.kill;
+            shader_block.branch.address = info.branch.address;
+        }
+        result.end = std::max(result.end, info.end);
+        result.blocks.push_back(shader_block);
+    }
+    if (result.decompilable) {
+        result.labels = std::move(state.labels);
+        return {result};
+    }
+
+    // Not decompilable: merge every unlabelled block into the block right before it
+    auto& blocks = result.blocks;
+    auto back = blocks.begin();
+    for (auto next = std::next(back); next != blocks.end();) {
+        const bool unlabelled = state.labels.count(next->start) == 0;
+        const bool contiguous = next->start == back->end + 1;
+        if (unlabelled && contiguous) {
+            back->end = next->end;
+            next = blocks.erase(next);
+        } else {
+            back = next;
+            ++next;
+        }
+    }
+    return {result};
+}
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@ -0,0 +1,63 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstring>
+#include <list>
+#include <optional>
+#include <unordered_set>
+
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::ConditionCode;
+using Tegra::Shader::Pred;
+
+constexpr s32 exit_branch = -1;
+
+/// Predicate and condition code that guard a branch.
+struct Condition {
+    Pred predicate{Pred::UnusedIndex};
+    ConditionCode cc{ConditionCode::T};
+
+    /// True when neither the predicate nor the condition code restricts execution.
+    bool IsUnconditional() const {
+        const bool no_predicate = predicate == Pred::UnusedIndex;
+        const bool always_true_cc = cc == ConditionCode::T;
+        return no_predicate && always_true_cc;
+    }
+    /// Member-wise equality.
+    bool operator==(const Condition& other) const {
+        return predicate == other.predicate && cc == other.cc;
+    }
+};
+
+/// A block of instructions [start, end] as reported by ScanFlow.
+struct ShaderBlock {
+    u32 start = 0;
+    u32 end = 0;
+    /// When set, branch carries no information and the block runs into the next one.
+    bool ignore_branch = false;
+    /// The branch terminating the block.
+    struct Branch {
+        Condition cond;
+        bool kills = false;
+        s32 address = 0;
+        /// Member-wise equality.
+        bool operator==(const Branch& b) const {
+            return cond == b.cond && kills == b.kills && address == b.address;
+        }
+    } branch{};
+    /// Member-wise equality.
+    bool operator==(const ShaderBlock& sb) const {
+        return start == sb.start && end == sb.end && ignore_branch == sb.ignore_branch &&
+               branch == sb.branch;
+    }
+};
+
+/// Result of scanning a shader's control flow.
+struct ShaderCharacteristics {
+    std::list<ShaderBlock> blocks;   // Blocks found in the program
+    bool decompilable = false;       // Whether the flow stacks could be resolved
+    u32 start = 0;                   // First address covered
+    u32 end = 0;                     // Highest block end address found
+    std::unordered_set<u32> labels;  // Addresses that are targets of branches
+};
+
+/// Scans program_code from start_address and splits it into blocks delimited by control flow.
+/// program_size is divided by the instruction size to obtain the scan limit.
+/// Returns an empty optional when the scan hits flow it cannot follow.
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
+                                              u32 start_address);
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/shader/control_flow.h"
 #include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"

@ -21,20 +22,6 @@ using Tegra::Shader::OpCode;

 namespace {

-/// Merges exit method of two parallel branches.
-constexpr ExitMethod ParallelExit(ExitMethod a, ExitMethod b) {
-    if (a == ExitMethod::Undetermined) {
-        return b;
-    }
-    if (b == ExitMethod::Undetermined) {
-        return a;
-    }
-    if (a == b) {
-        return a;
-    }
-    return ExitMethod::Conditional;
-}
-
 /**
 * Returns whether the instruction at the specified offset is a 'sched' instruction.
 * Sched instructions always appear before a sequence of 3 instructions.
@ -51,87 +38,106 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
 void ShaderIR::Decode() {
     std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));

-    std::set<u32> labels;
-    const ExitMethod exit_method = Scan(main_offset, MAX_PROGRAM_LENGTH, labels);
-    if (exit_method != ExitMethod::AlwaysEnd) {
-        UNREACHABLE_MSG("Program does not always end");
-    }
-
-    if (labels.empty()) {
-        basic_blocks.insert({main_offset, DecodeRange(main_offset, MAX_PROGRAM_LENGTH)});
+    disable_flow_stack = false; // only flipped once the scan reports a decompilable shader
+    const auto info = ScanFlow(program_code, program_size, main_offset);
+    if (info) { // an empty optional falls through to the brute-force path at the bottom
+        const auto& shader_info = *info;
+        coverage_begin = shader_info.start;
+        coverage_end = shader_info.end;
+        if (shader_info.decompilable) { // path 1: explicit control flow, no flow stack
+            disable_flow_stack = true;
+            const auto insert_block = ([this](NodeBlock& nodes, u32 label) {
+                if (label == exit_branch) { // sentinel: value of current_label before any label
+                    return;
+                }
+                basic_blocks.insert({label, nodes});
+            });
+            const auto& blocks = shader_info.blocks;
+            NodeBlock current_block;
+            u32 current_label = exit_branch;
+            for (auto& block : blocks) {
+                if (shader_info.labels.count(block.start) != 0) { // labelled start: flush, restart
+                    insert_block(current_block, current_label);
+                    current_block.clear();
+                    current_label = block.start;
+                }
+                if (!block.ignore_branch) { // decode up to block.end, then append its branch
+                    DecodeRangeInner(current_block, block.start, block.end);
+                    InsertControlFlow(current_block, block);
+                } else { // decode the whole range, block.end included
+                    DecodeRangeInner(current_block, block.start, block.end + 1);
+                }
+            }
+            insert_block(current_block, current_label); // flush the last accumulated block
+            return;
+        }
+        LOG_WARNING(HW_GPU, "Flow Stack Removing Failed! Falling back to old method");
+        // Path 2: not decompilable. Decode every scanned block as-is; the flow stack stays enabled.
+        for (const auto& block : shader_info.blocks) {
+            basic_blocks.insert({block.start, DecodeRange(block.start, block.end + 1)});
+        }
         return;
     }
+    LOG_WARNING(HW_GPU, "Flow Analysis Failed! Falling back to brute force compiling");

-    labels.insert(main_offset);
-
-    for (const u32 label : labels) {
-        const auto next_it = labels.lower_bound(label + 1);
-        const u32 next_label = next_it == labels.end() ? MAX_PROGRAM_LENGTH : *next_it;
-
-        basic_blocks.insert({label, DecodeRange(label, next_label)});
+    // Path 3: flow analysis failed. Brute force the shader by emitting one basic block per
+    // instruction, so that every position from main_offset to the end of the program is a label.
+    coverage_begin = main_offset;
+    const u32 shader_end = static_cast<u32>(program_size / sizeof(u64));
+    coverage_end = shader_end;
+    for (u32 label = main_offset; label < shader_end; label++) {
+        basic_blocks.insert({label, DecodeRange(label, label + 1)});
     }
 }

-ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) {
-    const auto [iter, inserted] =
-        exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
-    ExitMethod& exit_method = iter->second;
-    if (!inserted)
-        return exit_method;
-
-    for (u32 offset = begin; offset != end && offset != MAX_PROGRAM_LENGTH; ++offset) {
-        coverage_begin = std::min(coverage_begin, offset);
-        coverage_end = std::max(coverage_end, offset + 1);
-
-        const Instruction instr = {program_code[offset]};
-        const auto opcode = OpCode::Decode(instr);
-        if (!opcode)
-            continue;
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::EXIT: {
-            // The EXIT instruction can be predicated, which means that the shader can conditionally
-            // end on this instruction. We have to consider the case where the condition is not met
-            // and check the exit method of that other basic block.
-            using Tegra::Shader::Pred;
-            if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
-                return exit_method = ExitMethod::AlwaysEnd;
-            } else {
-                const ExitMethod not_met = Scan(offset + 1, end, labels);
-                return exit_method = ParallelExit(ExitMethod::AlwaysEnd, not_met);
-            }
-        }
-        case OpCode::Id::BRA: {
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            const ExitMethod no_jmp = Scan(offset + 1, end, labels);
-            const ExitMethod jmp = Scan(target, end, labels);
-            return exit_method = ParallelExit(no_jmp, jmp);
-        }
-        case OpCode::Id::SSY:
-        case OpCode::Id::PBK: {
-            // The SSY and PBK use a similar encoding as the BRA instruction.
-            UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                                 "Constant buffer branching is not supported");
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            // Continue scanning for an exit method.
-            break;
-        }
-        default:
-            break;
-        }
-    }
-    return exit_method = ExitMethod::AlwaysReturn;
-}
-
 NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
     NodeBlock basic_block;
-    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
-        pc = DecodeInstr(basic_block, pc);
-    }
+    DecodeRangeInner(basic_block, begin, end); // same loop as before, now also callable on an existing block
     return basic_block;
 }

+/// Appends to bb the nodes decoded from the instructions starting at begin. Decoding stops once
+/// the program counter reaches end, or MAX_PROGRAM_LENGTH when begin is greater than end.
+void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
+    const u32 limit = begin > end ? MAX_PROGRAM_LENGTH : end;
+    u32 pc = begin;
+    while (pc < limit) {
+        // DecodeInstr returns the position of the next instruction to decode.
+        pc = DecodeInstr(bb, pc);
+    }
+}
+
+/// Appends the terminating control flow of a scanned block to both bb and global_code.
+/// A negative branch address ends the shader (Discard when branch.kills is set, Exit otherwise);
+/// any other address becomes a Branch to that address. The node is wrapped in a conditional for
+/// the condition code (unless it is ConditionCode::T) and for the predicate (unless it is
+/// Pred::UnusedIndex).
+void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
+    const auto& branch = block.branch;
+
+    // Pick the unconditional operation first.
+    Node flow;
+    if (branch.address >= 0) {
+        flow = Operation(OperationCode::Branch, Immediate(branch.address));
+    } else if (branch.kills) {
+        flow = Operation(OperationCode::Discard);
+    } else {
+        flow = Operation(OperationCode::Exit);
+    }
+
+    // Wrap with the condition code first and the predicate second, so the predicate test ends up
+    // as the outermost conditional.
+    if (branch.cond.cc != ConditionCode::T) {
+        flow = Conditional(GetConditionCode(branch.cond.cc), {flow});
+    }
+    if (branch.cond.predicate != Pred::UnusedIndex) {
+        // Values above 7 are treated as the negated form of predicate (value - 8).
+        const u32 raw_pred = static_cast<u32>(branch.cond.predicate);
+        const bool negated = raw_pred > 7;
+        const u32 pred_index = negated ? raw_pred - 8 : raw_pred;
+        flow = Conditional(GetPredicate(pred_index, negated), {flow});
+    }
+
+    bb.push_back(flow);
+    global_code.push_back(flow);
+}
+
 u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
    // Ignore sched instructions when generating code.
    if (IsSchedInstruction(pc, main_offset)) {
@ -140,15 +146,18 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {

    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);
+    const u32 nv_address = ConvertAddressToNvidiaSpace(pc);

    // Decoding failure
    if (!opcode) {
        UNIMPLEMENTED_MSG("Unhandled instruction: {0:x}", instr.value);
+        bb.push_back(Comment(fmt::format("{:05x} Unimplemented Shader instruction (0x{:016x})",
+                                         nv_address, instr.value)));
        return pc + 1;
    }

-    bb.push_back(
-        Comment(fmt::format("{}: {} (0x{:016x})", pc, opcode->get().GetName(), instr.value)));
+    bb.push_back(Comment(
+        fmt::format("{:05x} {} (0x{:016x})", nv_address, opcode->get().GetName(), instr.value)));

    using Tegra::Shader::Pred;
    UNIMPLEMENTED_IF_MSG(instr.pred.full_pred == Pred::NeverExecute,
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@ -91,11 +91,46 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        break;
    }
    case OpCode::Id::BRA: {
-        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                             "BRA with constant buffers are not implemented");
+        Node branch;
+        if (instr.bra.constant_buffer == 0) {
+            const u32 target = pc + instr.bra.GetBranchTarget();
+            branch = Operation(OperationCode::Branch, Immediate(target));
+        } else {
+            const u32 target = pc + 1;
+            const Node op_a = GetConstBuffer(instr.cbuf36.index, instr.cbuf36.GetOffset());
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            const Node operand =
+                Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+            branch = Operation(OperationCode::BranchIndirect, convert);
+        }

-        const u32 target = pc + instr.bra.GetBranchTarget();
-        const Node branch = Operation(OperationCode::Branch, Immediate(target));
+        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
+        if (cc != Tegra::Shader::ConditionCode::T) {
+            bb.push_back(Conditional(GetConditionCode(cc), {branch}));
+        } else {
+            bb.push_back(branch);
+        }
+        break;
+    }
+    case OpCode::Id::BRX: {
+        Node operand;
+        if (instr.brx.constant_buffer != 0) {
+            const s32 target = pc + 1;
+            const Node index = GetRegister(instr.gpr8);
+            const Node op_a =
+                GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        } else {
+            const s32 target = pc + instr.brx.GetBranchExtend();
+            const Node op_a = GetRegister(instr.gpr8);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        }
+        const Node branch = Operation(OperationCode::BranchIndirect, operand);

        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
        if (cc != Tegra::Shader::ConditionCode::T) {
@ -109,6 +144,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                             "Constant buffer flow is not supported");

+        if (disable_flow_stack) {
+            break;
+        }
+
        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
        const u32 target = pc + instr.bra.GetBranchTarget();
        bb.push_back(
@ -119,6 +158,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                             "Constant buffer PBK is not supported");

+        if (disable_flow_stack) {
+            break;
+        }
+
        // PBK pushes to a stack the address where BRK will jump to.
        const u32 target = pc + instr.bra.GetBranchTarget();
        bb.push_back(
@ -130,6 +173,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}",
                             static_cast<u32>(cc));

+        if (disable_flow_stack) {
+            break;
+        }
+
        // The SYNC opcode jumps to the address previously set by the SSY opcode
        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
        break;
@ -138,6 +185,9 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
        UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}",
                             static_cast<u32>(cc));
+        if (disable_flow_stack) {
+            break;
+        }

        // The BRK opcode jumps to the address previously set by the PBK opcode
        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@ -148,11 +148,12 @@ enum class OperationCode {

    ImageStore, /// (MetaImage, float[N] coords) -> void

-    Branch,        /// (uint branch_target) -> void
-    PushFlowStack, /// (uint branch_target) -> void
-    PopFlowStack,  /// () -> void
-    Exit,          /// () -> void
-    Discard,       /// () -> void
+    Branch,         /// (uint branch_target) -> void
+    BranchIndirect, /// (uint branch_target) -> void
+    PushFlowStack,  /// (uint branch_target) -> void
+    PopFlowStack,   /// () -> void
+    Exit,           /// () -> void
+    Discard,        /// () -> void

    EmitVertex,   /// () -> void
    EndPrimitive, /// () -> void
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@ -22,8 +22,8 @@ using Tegra::Shader::PredCondition;
 using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;

-ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
-    : program_code{program_code}, main_offset{main_offset} {
+ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, const std::size_t size)
+    : program_code{program_code}, main_offset{main_offset}, program_size{size} { // size: presumably bytes (Decode divides it by sizeof(u64))
     Decode();
 }

--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@ -22,18 +22,12 @@

 namespace VideoCommon::Shader {

+struct ShaderBlock;
+
 using ProgramCode = std::vector<u64>;

 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;

-/// Describes the behaviour of code path of a given entry point and a return point.
-enum class ExitMethod {
-    Undetermined, ///< Internal value. Only occur when analyzing JMP loop.
-    AlwaysReturn, ///< All code paths reach the return point.
-    Conditional,  ///< Code path reaches the return point or an END instruction conditionally.
-    AlwaysEnd,    ///< All code paths reach a END instruction.
-};
-
 class ConstBuffer {
 public:
    explicit ConstBuffer(u32 max_offset, bool is_indirect)
@ -73,7 +67,7 @@ struct GlobalMemoryUsage {

 class ShaderIR final {
 public:
-    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
+    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, std::size_t size);
    ~ShaderIR();

    const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@ -129,12 +123,20 @@ public:
        return header;
    }

+    bool IsFlowStackDisabled() const { // true once Decode has taken the decompilable path
+        return disable_flow_stack;
+    }
+
+    /// Converts an instruction index into an offset relative to main_offset, scaled by the size
+    /// of one instruction.
+    u32 ConvertAddressToNvidiaSpace(const u32 address) const {
+        const u32 relative_index = address - main_offset;
+        return static_cast<u32>(relative_index * sizeof(Tegra::Shader::Instruction));
+    }
+
 private:
    void Decode();

-    ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels);
-
    NodeBlock DecodeRange(u32 begin, u32 end);
+    void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end);
+    void InsertControlFlow(NodeBlock& bb, const ShaderBlock& block);

    /**
     * Decodes a single instruction from Tegra to IR.
@ -326,10 +328,11 @@ private:

    const ProgramCode& program_code;
    const u32 main_offset;
+    const std::size_t program_size;
+    bool disable_flow_stack{};

    u32 coverage_begin{};
    u32 coverage_end{};
-    std::map<std::pair<u32, u32>, ExitMethod> exit_method_map;

    std::map<u32, NodeBlock> basic_blocks;
    NodeBlock global_code;