// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include <algorithm>
#include <vector>

#include <fmt/format.h>

#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "video_core/engines/shader_bytecode.h"
#include "video_core/shader/shader_ir.h"

namespace VideoCommon::Shader {

using Tegra::Shader::Attribute;
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
using Tegra::Shader::Register;
namespace {
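// Returns the number of 32-bit words a memory instruction of the given uniform type moves.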
u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
    switch (uniform_type) {
    case Tegra::Shader::UniformType::Single:
        return 1;
    case Tegra::Shader::UniformType::Double:
        return 2;
    case Tegra::Shader::UniformType::Quad:
    case Tegra::Shader::UniformType::UnsignedQuad:
        return 4;
    default:
        UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
        return 1;
    }
}
} // namespace

u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);

    switch (opcode->get().GetId()) {
    case OpCode::Id::LD_A: {
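        // LD_A loads one or more input attribute words into consecutive destination registers.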
        // Note: Shouldn't this be interp mode flat? As in no interpolation made.
        UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
                             "Indirect attribute loads are not supported");
        UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0,
                             "Unaligned attribute loads are not supported");

        Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Pass,
                                          Tegra::Shader::IpaSampleMode::Default};

        u64 next_element = instr.attribute.fmt20.element;
        auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());

        const auto LoadNextElement = [&](u32 reg_offset) {
            const Node buffer = GetRegister(instr.gpr39);
            const Node attribute = GetInputAttribute(static_cast<Attribute::Index>(next_index),
                                                     next_element, input_mode, buffer);
            SetRegister(bb, instr.gpr0.Value() + reg_offset, attribute);

            // Load the next attribute element into the following register. If the element
            // to load goes beyond the vec4 size, load the first element of the next
            // attribute.
            next_element = (next_element + 1) % 4;
            next_index = next_index + (next_element == 0 ? 1 : 0);
        };

        const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
        for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
            LoadNextElement(reg_offset);
        }
        break;
    }
    case OpCode::Id::LD_C: {
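        // LD_C loads from a const buffer; the offset into the buffer is taken indirectly from
        // a register, so the final address is only known at shader run time.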
        UNIMPLEMENTED_IF(instr.ld_c.unknown != 0);

        Node index = GetRegister(instr.gpr8);

        const Node op_a =
            GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index);

        switch (instr.ld_c.type.Value()) {
        case Tegra::Shader::UniformType::Single:
            SetRegister(bb, instr.gpr0, op_a);
            break;
        case Tegra::Shader::UniformType::Double: {
            const Node op_b =
                GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 4, index);

            SetTemporal(bb, 0, op_a);
            SetTemporal(bb, 1, op_b);
            SetRegister(bb, instr.gpr0, GetTemporal(0));
            SetRegister(bb, instr.gpr0.Value() + 1, GetTemporal(1));
            break;
        }
        default:
            UNIMPLEMENTED_MSG("Unhandled type: {}",
                              static_cast<unsigned>(instr.ld_c.type.Value()));
        }
        break;
    }
    case OpCode::Id::LD_L: {
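        // LD_L reads 32, 64 or 128 bits of local memory at gpr8 plus an immediate offset.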
LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}",
static_cast<u64>(instr.ld_l.unknown.Value()));
const auto GetLmem = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset);
const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8),
immediate_offset);
return GetLocalMemory(address);
};
2018-12-21 06:33:15 +01:00
switch (instr.ldst_sl.type.Value()) {
case Tegra::Shader::StoreType::Bits32:
case Tegra::Shader::StoreType::Bits64:
case Tegra::Shader::StoreType::Bits128: {
const u32 count = [&]() {
switch (instr.ldst_sl.type.Value()) {
case Tegra::Shader::StoreType::Bits32:
return 1;
case Tegra::Shader::StoreType::Bits64:
return 2;
case Tegra::Shader::StoreType::Bits128:
return 4;
default:
UNREACHABLE();
return 0;
}
}();
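            // Read everything into temporaries first so that a destination register that
            // overlaps the address register cannot corrupt the remaining reads.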
            for (u32 i = 0; i < count; ++i) {
                SetTemporal(bb, i, GetLmem(i * 4));
            }
            for (u32 i = 0; i < count; ++i) {
                SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
            }
            break;
        }
        default:
            UNIMPLEMENTED_MSG("LD_L Unhandled type: {}",
                              static_cast<u32>(instr.ldst_sl.type.Value()));
        }
        break;
    }
    case OpCode::Id::LDG: {
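        // LDG reads consecutive 32-bit words from global memory; the base address is tracked
        // back to the const buffer entry that supplied it.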
        const auto [real_address_base, base_address, descriptor] =
            TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
                                    static_cast<u32>(instr.ldg.immediate_offset.Value()), false);

        const u32 count = GetUniformTypeElementsCount(instr.ldg.type);
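        // Load into temporaries first, then copy to the destination registers; this keeps a
        // destination register that overlaps the address register from clobbering later reads.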
        for (u32 i = 0; i < count; ++i) {
            const Node it_offset = Immediate(i * 4);
            const Node real_address =
                Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
            SetTemporal(bb, i, gmem);
        }
        for (u32 i = 0; i < count; ++i) {
            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
        }
        break;
    }
    case OpCode::Id::STG: {
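        // STG writes consecutive 32-bit words from source registers to global memory.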
        const auto [real_address_base, base_address, descriptor] =
            TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
                                    static_cast<u32>(instr.stg.immediate_offset.Value()), true);

        // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
        SetTemporal(bb, 0, real_address_base);

        const u32 count = GetUniformTypeElementsCount(instr.stg.type);
        for (u32 i = 0; i < count; ++i) {
            SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
        }
        for (u32 i = 0; i < count; ++i) {
            const Node it_offset = Immediate(i * 4);
            const Node real_address =
                Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
        }
        break;
    }
    case OpCode::Id::ST_A: {
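        // ST_A writes consecutive source registers to output attribute elements.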
        UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
                             "Indirect attribute stores are not supported");
        UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0,
                             "Unaligned attribute stores are not supported");

        u64 next_element = instr.attribute.fmt20.element;
        auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());

        const auto StoreNextElement = [&](u32 reg_offset) {
            const auto dest = GetOutputAttribute(static_cast<Attribute::Index>(next_index),
                                                 next_element, GetRegister(instr.gpr39));
            const auto src = GetRegister(instr.gpr0.Value() + reg_offset);
            bb.push_back(Operation(OperationCode::Assign, dest, src));

            // Store the next register into the following attribute element. If the element to
            // write goes beyond the vec4 size, move on to the first element of the next
            // attribute.
            next_element = (next_element + 1) % 4;
            next_index = next_index + (next_element == 0 ? 1 : 0);
        };

        const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
        for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
            StoreNextElement(reg_offset);
        }
        break;
    }
    case OpCode::Id::ST_L: {
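        // ST_L writes 32, 64 or 128 bits from consecutive registers to local memory. The wider
        // cases fall through so that they reuse the narrower stores below.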
LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}",
static_cast<u64>(instr.st_l.cache_management.Value()));
2018-12-21 06:33:31 +01:00
const auto GetLmemAddr = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset);
return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate);
};
2018-12-21 06:33:31 +01:00
switch (instr.ldst_sl.type.Value()) {
case Tegra::Shader::StoreType::Bits128:
SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3));
SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2));
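            // Fallthrough is intentional: a 128-bit store also writes the lower 64 bits.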
        case Tegra::Shader::StoreType::Bits64:
            SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1));
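            // Fallthrough is intentional: a 64-bit store also writes the lowest 32 bits.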
        case Tegra::Shader::StoreType::Bits32:
            SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0));
            break;
        default:
            UNIMPLEMENTED_MSG("ST_L Unhandled type: {}",
                              static_cast<u32>(instr.ldst_sl.type.Value()));
        }
        break;
    }
    default:
        UNIMPLEMENTED_MSG("Unhandled memory instruction: {}", opcode->get().GetName());
    }

    return pc;
}
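
// Tracks the address in addr_register back to the const buffer entry that produced it, marks
// the corresponding global memory region as read or written, and returns the real address node,
// the base address node and the descriptor identifying the region.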
std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb,
Node addr_register,
u32 immediate_offset,
bool is_write) {
const Node base_address{
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf = std::get_if<CbufNode>(base_address);
ASSERT(cbuf != nullptr);
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
ASSERT(cbuf_offset_imm != nullptr);
const auto cbuf_offset = cbuf_offset_imm->GetValue();
bb.push_back(
Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
auto& usage = entry->second;
if (is_write) {
usage.is_written = true;
} else {
usage.is_read = true;
}
const auto real_address =
Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);
return {real_address, base_address, descriptor};
}

} // namespace VideoCommon::Shader