video_core: Cache GPU internal writes.

2023-01-04 22:05:20 -05:00 · 2023-01-04 22:05:20 -05:00 · 6c7eb81f7d
commit 6c7eb81f7d
parent b78328f19a
10 changed files with 185 additions and 30 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -85,6 +85,7 @@ add_library(video_core STATIC
    gpu.h
    gpu_thread.cpp
    gpu_thread.h
+    invalidation_accumulator.h
    memory_manager.cpp
    memory_manager.h
    precompiled_headers.h
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
                                       regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
    }
 }

--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }

 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
    switch (regs.report_semaphore.query.operation) {
    case Regs::ReportSemaphore::Operation::Release:
        if (regs.report_semaphore.query.short_query != 0) {
@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {

    const GPUVAddr address{buffer_address + regs.const_buffer.offset};
    const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);

    // Increment the current buffer position.
    regs.const_buffer.offset += static_cast<u32>(copy_size);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
    if (launch.multi_line_enable) {
        const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
        const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
        if (!is_src_pitch && !is_dst_pitch) {
            // If both the source and the destination are in block layout, assert.
            CopyBlockLinearToBlockLinear();
@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
                                            reinterpret_cast<u8*>(tmp_buffer.data()),
                                            regs.line_length_in * sizeof(u32));
        } else {
+            memory_manager.FlushCaching();
            const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                       ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@ -121,7 +122,7 @@ void MaxwellDMA::Launch() {
                    memory_manager.ReadBlockUnsafe(
                        convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                        tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
                                              tmp_buffer.size());
                }
            } else if (is_src_pitch && !is_dst_pitch) {
@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
                for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                   tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                        convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                        tmp_buffer.data(), tmp_buffer.size());
                }
@ -141,7 +142,7 @@ void MaxwellDMA::Launch() {
                    std::vector<u8> tmp_buffer(regs.line_length_in);
                    memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                   regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
                                              regs.line_length_in);
                }
            }
@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyPitchToBlockLinear() {
@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                   regs.pitch_in);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                     regs.src_params.block_size.height, regs.src_params.block_size.depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                   dst.block_size.height, dst.block_size.depth, pitch);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::ReleaseSemaphore() {
--- a/src/video_core/invalidation_accumulator.h
+++ b/src/video_core/invalidation_accumulator.h
@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/**
+ * Accumulates the GPU virtual address ranges passed to Add() so that they can be handed to a
+ * callback in one batch.
+ *
+ * Every range is widened to atomicy_size (32-byte) boundaries. A range that begins exactly where
+ * the current run ends extends that run; any other range that is not already contained in the
+ * run closes it (moving it to the pending list) and starts a new run.
+ */
+class InvalidationAccumulator {
+public:
+    InvalidationAccumulator() = default;
+    ~InvalidationAccumulator() = default;
+
+    /**
+     * Records the range [address, address + size).
+     *
+     * @param address First address of the range.
+     * @param size    Length of the range in bytes; a zero-sized range is ignored.
+     */
+    void Add(GPUVAddr address, size_t size) {
+        // An empty range touches no memory, so there is nothing to record.
+        if (size == 0) {
+            return;
+        }
+        // Fast path: the range lies completely inside the run being accumulated.
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return;
+        }
+        // Widen the range to whole granules: round the end up and the start down, then measure
+        // the size from the aligned start so that the widened range still covers every byte of
+        // the original one.
+        const GPUVAddr aligned_end = (address + size + atomicy_side_mask) & atomicy_mask;
+        address &= atomicy_mask;
+        size = aligned_end - address;
+
+        const auto begin_run = [&]() {
+            // Keep the run that is being replaced, if there is one.
+            if (has_collected) {
+                buffer.emplace_back(start_address, accumulated_size);
+            }
+            start_address = address;
+            accumulated_size = size;
+            last_collection = start_address + size;
+        };
+        if (!has_collected) [[unlikely]] {
+            begin_run();
+            has_collected = true;
+            return;
+        }
+        if (address != last_collection) [[unlikely]] {
+            begin_run();
+            return;
+        }
+        // Contiguous with the current run: extend it in place.
+        accumulated_size += size;
+        last_collection += size;
+    }
+
+    /// Discards every recorded range and returns to the initial, empty state.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        accumulated_size = 0;
+        has_collected = false;
+    }
+
+    /// Returns true if at least one range has been recorded since construction or Clear().
+    bool AnyAccumulated() const {
+        return has_collected;
+    }
+
+    /**
+     * Invokes func(address, size) for every recorded run, closed runs first and the run that is
+     * still open last. Does nothing when no range has been recorded.
+     *
+     * The recorded state is left untouched; call Clear() afterwards to reset it.
+     */
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (!has_collected) {
+            return;
+        }
+        for (const auto& [address, size] : buffer) {
+            func(address, size);
+        }
+        // The open run is reported directly instead of being appended to the pending list, so
+        // a repeated Callback() before Clear() cannot report it more than once per call.
+        func(start_address, accumulated_size);
+    }
+
+private:
+    // Ranges are tracked at a granularity of 1 << atomicy_bits bytes.
+    static constexpr size_t atomicy_bits = 5;
+    static constexpr size_t atomicy_size = 1ULL << atomicy_bits;
+    static constexpr size_t atomicy_side_mask = atomicy_size - 1;
+    static constexpr size_t atomicy_mask = ~atomicy_side_mask;
+    GPUVAddr start_address{};   // First address of the run being accumulated.
+    GPUVAddr last_collection{}; // One past the last address of the run being accumulated.
+    size_t accumulated_size{};  // Size in bytes of the run being accumulated.
+    bool has_collected{};       // True once any range has been recorded.
+    std::vector<std::pair<GPUVAddr, size_t>> buffer; // Closed runs waiting to be reported.
+};
+
+} // namespace VideoCommon
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@ -11,6 +11,7 @@
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@ -26,7 +27,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
      entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                           page_bits != big_page_bits ? page_bits : 0},
      kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
-                                      1, std::memory_order_acq_rel)} {
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
    address_space_size = 1ULL << address_space_bits;
    page_size = 1ULL << page_bits;
    page_mask = page_size - 1ULL;
@ -185,15 +187,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    if (size == 0) {
        return;
    }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);

-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
    }
+    page_stash.clear();

    BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
    PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@ -454,6 +453,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
 }

+// Writes `size` bytes from `src_buffer` to the GPU virtual address `gpu_dest_addr`, passing
+// CacheType::None to WriteBlockImpl, and then records the written range in the accumulator.
+// The recorded ranges are consumed later by FlushCaching().
+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
+    // Record the range only after the data has been written.
+    accumulator->Add(gpu_dest_addr, size);
+}
+
 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
                                VideoCommon::CacheType which) const {
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@ -663,7 +668,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    GPUVAddr gpu_addr, std::size_t size) const {
    std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result);
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
    std::optional<VAddr> old_page_addr{};
    const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                [[maybe_unused]] std::size_t offset,
@ -685,8 +700,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@ -703,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@ -715,7 +738,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    };
    MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
    split(0, 0, 0);
-    return result;
+}
+
+// Hands every range recorded by WriteBlockCached() to the rasterizer in a single
+// InnerInvalidation() call, then resets the accumulator.
+void MemoryManager::FlushCaching() {
+    // Only do work when something has been recorded since the last flush.
+    if (accumulator->AnyAccumulated()) {
+        // Split each recorded GPU range into CPU-address segments, appended to page_stash.
+        const auto collect_segments = [this](GPUVAddr range_base, size_t range_size) {
+            GetSubmappedRangeImpl<false>(range_base, range_size, page_stash);
+        };
+        accumulator->Callback(collect_segments);
+        rasterizer->InnerInvalidation(page_stash);
+        // page_stash is scratch storage; leave it empty for its next user.
+        page_stash.clear();
+        accumulator->Clear();
+    }
 }

 } // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@ -19,6 +19,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }

+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@ -80,6 +84,7 @@ public:
     */
    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);

    /**
     * Checks if a gpu region can be simply read with a pointer.
@ -102,7 +107,7 @@ public:
     * will be returned;
     */
    std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
-                                                                    std::size_t size) const;
+                                                                 std::size_t size) const;

    GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size,
                 PTEKind kind = PTEKind::INVALID, bool is_big_pages = true);
@ -129,6 +134,8 @@ public:
    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
                               size_t max_size = std::numeric_limits<size_t>::max()) const;

+    void FlushCaching();
+
 private:
    template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
    inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
@ -154,6 +161,12 @@ private:
    inline bool IsBigPageContinous(size_t big_page_index) const;
    inline void SetBigPageContinous(size_t big_page_index, bool value);

+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
    Core::System& system;
    Core::Memory::Memory& memory;
    Core::DeviceMemory& device_memory;
@ -201,10 +214,12 @@ private:
    Common::VirtualBuffer<u32> big_page_table_cpu;

    std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};

    constexpr static size_t continous_bits = 64;

    const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;

    static std::atomic<size_t> unique_identifier_generator;
 };
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@ -6,6 +6,7 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
 #include "video_core/cache_types.h"
@ -95,6 +96,12 @@ public:
    virtual void InvalidateRegion(VAddr addr, u64 size,
                                  VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

+    /// Invalidates every (address, size) pair in sequences. This default implementation calls
+    /// InvalidateRegion once per pair; it is virtual so that an implementation can override it.
+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto& sequence : sequences) {
+            InvalidateRegion(sequence.first, sequence.second);
+        }
+    }
+
    /// Notify rasterizer that any caches of the specified region are desync with guest
    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;

--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {

    SCOPE_EXIT({ gpu.TickWork(); });
    FlushWork();
+    gpu_memory->FlushCaching();

    query_cache.UpdateCounters();

@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {

 void RasterizerVulkan::DispatchCompute() {
    FlushWork();
+    gpu_memory->FlushCaching();

    ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
    if (!pipeline) {
@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
    }
 }

+// Applies a batch of (address, size) ranges to the texture, buffer, query and pipeline caches.
+// Each mutex-protected cache is handled in its own pass, so its mutex is taken once for the
+// whole batch.
+void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+    {
+        std::scoped_lock texture_guard{texture_cache.mutex};
+        for (const auto& [cpu_addr, byte_count] : sequences) {
+            texture_cache.WriteMemory(cpu_addr, byte_count);
+        }
+    }
+    {
+        std::scoped_lock buffer_guard{buffer_cache.mutex};
+        for (const auto& [cpu_addr, byte_count] : sequences) {
+            buffer_cache.WriteMemory(cpu_addr, byte_count);
+        }
+    }
+    // The query and pipeline caches are invalidated here without taking a lock.
+    for (const auto& [cpu_addr, byte_count] : sequences) {
+        query_cache.InvalidateRegion(cpu_addr, byte_count);
+        pipeline_cache.InvalidateRegion(cpu_addr, byte_count);
+    }
+}
+
 void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
    if (addr == 0 || size == 0) {
        return;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@ -79,6 +79,7 @@ public:
                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void InvalidateRegion(VAddr addr, u64 size,
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
    void InvalidateGPUCache() override;
    void UnmapMemory(VAddr addr, u64 size) override;