From 6c7eb81f7d871f5c08a4844471633a67725aae73 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 4 Jan 2023 22:05:20 -0500 Subject: [PATCH] video_core: Cache GPU internal writes. --- src/video_core/CMakeLists.txt | 1 + src/video_core/engines/engine_upload.cpp | 2 +- src/video_core/engines/maxwell_3d.cpp | 7 +- src/video_core/engines/maxwell_dma.cpp | 17 ++-- src/video_core/invalidation_accumulator.h | 78 +++++++++++++++++++ src/video_core/memory_manager.cpp | 62 +++++++++++---- src/video_core/memory_manager.h | 17 +++- src/video_core/rasterizer_interface.h | 7 ++ .../renderer_vulkan/vk_rasterizer.cpp | 23 ++++++ .../renderer_vulkan/vk_rasterizer.h | 1 + 10 files changed, 185 insertions(+), 30 deletions(-) create mode 100644 src/video_core/invalidation_accumulator.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index aa271a3770..b7095ae133 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -85,6 +85,7 @@ add_library(video_core STATIC gpu.h gpu_thread.cpp gpu_thread.h + invalidation_accumulator.h memory_manager.cpp memory_manager.h precompiled_headers.h diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index cea1dd8b0f..7f5a0c29d7 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -76,7 +76,7 @@ void State::ProcessData(std::span read_buffer) { regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, x_elements, regs.line_count, regs.dest.BlockHeight(), regs.dest.BlockDepth(), regs.line_length_in); - memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); + memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index fbfd1ddd24..97f5477897 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { } void Maxwell3D::ProcessQueryGet() { - // TODO(Subv): Support the other query units. - if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) { - LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented"); - } - switch (regs.report_semaphore.query.operation) { case Regs::ReportSemaphore::Operation::Release: if (regs.report_semaphore.query.short_query != 0) { @@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) { const GPUVAddr address{buffer_address + regs.const_buffer.offset}; const size_t copy_size = amount * sizeof(u32); - memory_manager.WriteBlock(address, start_base, copy_size); + memory_manager.WriteBlockCached(address, start_base, copy_size); // Increment the current buffer position. regs.const_buffer.offset += static_cast(copy_size); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 01f70ea9e5..7bf08e3e0d 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -69,7 +69,7 @@ void MaxwellDMA::Launch() { if (launch.multi_line_enable) { const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; - + memory_manager.FlushCaching(); if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. CopyBlockLinearToBlockLinear(); @@ -104,6 +104,7 @@ void MaxwellDMA::Launch() { reinterpret_cast(tmp_buffer.data()), regs.line_length_in * sizeof(u32)); } else { + memory_manager.FlushCaching(); const auto convert_linear_2_blocklinear_addr = [](u64 address) { return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) | ((address & 0x180) >> 1) | ((address & 0x20) << 3); @@ -121,7 +122,7 @@ void MaxwellDMA::Launch() { memory_manager.ReadBlockUnsafe( convert_linear_2_blocklinear_addr(regs.offset_in + offset), tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), tmp_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { @@ -132,7 +133,7 @@ void MaxwellDMA::Launch() { for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlock( + memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), tmp_buffer.data(), tmp_buffer.size()); } @@ -141,7 +142,7 @@ void MaxwellDMA::Launch() { std::vector tmp_buffer(regs.line_length_in); memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); - memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), regs.line_length_in); } } @@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, regs.pitch_out); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::CopyPitchToBlockLinear() { @@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() { dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, regs.pitch_in); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::FastCopyBlockLinearToPitch() { @@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { regs.src_params.block_size.height, regs.src_params.block_size.depth, regs.pitch_out); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::CopyBlockLinearToBlockLinear() { @@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() { dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count, dst.block_size.height, dst.block_size.depth, pitch); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::ReleaseSemaphore() { diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h new file mode 100644 index 0000000000..42420e31c8 --- /dev/null +++ b/src/video_core/invalidation_accumulator.h @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/common_types.h" + +namespace VideoCommon { + +class InvalidationAccumulator { +public: + InvalidationAccumulator() = default; + ~InvalidationAccumulator() = default; + + void Add(GPUVAddr address, size_t size) { + const auto reset_values = [&]() { + if (has_collected) { + buffer.emplace_back(start_address, accumulated_size); + } + start_address = address; + accumulated_size = size; + last_collection = start_address + size; + }; + if (address >= start_address && address + size <= last_collection) [[likely]] { + return; + } + size = (address + size + atomicy_side_mask) & atomicy_mask - address; + address = address & atomicy_mask; + if (!has_collected) [[unlikely]] { + reset_values(); + has_collected = true; + return; + } + if (address != last_collection) [[unlikely]] { + reset_values(); + return; + } + accumulated_size += size; + last_collection += size; + } + + void Clear() { + buffer.clear(); + start_address = 0; + last_collection = 0; + has_collected = false; + } + + bool AnyAccumulated() const { + return has_collected; + } + + template + void Callback(Func&& func) { + if (!has_collected) { + return; + } + buffer.emplace_back(start_address, accumulated_size); + for (auto& [address, size] : buffer) { + func(address, size); + } + } + +private: + static constexpr size_t atomicy_bits = 5; + static constexpr size_t atomicy_size = 1ULL << atomicy_bits; + static constexpr size_t atomicy_side_mask = atomicy_size - 1; + static constexpr size_t atomicy_mask = ~atomicy_side_mask; + GPUVAddr start_address{}; + GPUVAddr last_collection{}; + size_t accumulated_size{}; + bool has_collected{}; + std::vector> buffer; +}; + +} // namespace VideoCommon diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 3a5cdeb39f..83924475b5 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -11,6 +11,7 @@ #include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_process.h" #include "core/memory.h" +#include "video_core/invalidation_accumulator.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" @@ -26,7 +27,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64 entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38, page_bits != big_page_bits ? page_bits : 0}, kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add( - 1, std::memory_order_acq_rel)} { + 1, std::memory_order_acq_rel)}, + accumulator{std::make_unique()} { address_space_size = 1ULL << address_space_bits; page_size = 1ULL << page_bits; page_mask = page_size - 1ULL; @@ -185,15 +187,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { if (size == 0) { return; } - const auto submapped_ranges = GetSubmappedRange(gpu_addr, size); + GetSubmappedRangeImpl(gpu_addr, size, page_stash); - for (const auto& [map_addr, map_size] : submapped_ranges) { - // Flush and invalidate through the GPU interface, to be asynchronous if possible. - const std::optional cpu_addr = GpuToCpuAddress(map_addr); - ASSERT(cpu_addr); - - rasterizer->UnmapMemory(*cpu_addr, map_size); + for (const auto& [map_addr, map_size] : page_stash) { + rasterizer->UnmapMemory(map_addr, map_size); } + page_stash.clear(); BigPageTableOp(gpu_addr, 0, size, PTEKind::INVALID); PageTableOp(gpu_addr, 0, size, PTEKind::INVALID); @@ -454,6 +453,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf WriteBlockImpl(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); } +void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, + std::size_t size) { + WriteBlockImpl(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); + accumulator->Add(gpu_dest_addr, size); +} + void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size, VideoCommon::CacheType which) const { auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, @@ -663,7 +668,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons std::vector> MemoryManager::GetSubmappedRange( GPUVAddr gpu_addr, std::size_t size) const { std::vector> result{}; - std::optional> last_segment{}; + GetSubmappedRangeImpl(gpu_addr, size, result); + return result; +} + +template +void MemoryManager::GetSubmappedRangeImpl( + GPUVAddr gpu_addr, std::size_t size, + std::vector, std::size_t>>& + result) const { + std::optional, std::size_t>> + last_segment{}; std::optional old_page_addr{}; const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset, @@ -685,8 +700,12 @@ std::vector> MemoryManager::GetSubmappedRange( } old_page_addr = {cpu_addr_base + copy_amount}; if (!last_segment) { - const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; - last_segment = {new_base_addr, copy_amount}; + if constexpr (is_gpu_address) { + const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; + last_segment = {new_base_addr, copy_amount}; + } else { + last_segment = {cpu_addr_base, copy_amount}; + } } else { last_segment->second += copy_amount; } @@ -703,8 +722,12 @@ std::vector> MemoryManager::GetSubmappedRange( } old_page_addr = {cpu_addr_base + copy_amount}; if (!last_segment) { - const GPUVAddr new_base_addr = (page_index << page_bits) + offset; - last_segment = {new_base_addr, copy_amount}; + if constexpr (is_gpu_address) { + const GPUVAddr new_base_addr = (page_index << page_bits) + offset; + last_segment = {new_base_addr, copy_amount}; + } else { + last_segment = {cpu_addr_base, copy_amount}; + } } else { last_segment->second += copy_amount; } @@ -715,7 +738,18 @@ std::vector> MemoryManager::GetSubmappedRange( }; MemoryOperation(gpu_addr, size, extend_size_big, split, do_short_pages); split(0, 0, 0); - return result; +} + +void MemoryManager::FlushCaching() { + if (!accumulator->AnyAccumulated()) { + return; + } + accumulator->Callback([this](GPUVAddr addr, size_t size) { + GetSubmappedRangeImpl(addr, size, page_stash); + }); + rasterizer->InnerInvalidation(page_stash); + page_stash.clear(); + accumulator->Clear(); } } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 828e13439e..e6de0d0cba 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -19,6 +19,10 @@ namespace VideoCore { class RasterizerInterface; } +namespace VideoCommon { +class InvalidationAccumulator; +} + namespace Core { class DeviceMemory; namespace Memory { @@ -80,6 +84,7 @@ public: */ void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); /** * Checks if a gpu region can be simply read with a pointer. @@ -102,7 +107,7 @@ public: * will be returned; */ std::vector> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); @@ -129,6 +134,8 @@ public: size_t GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size = std::numeric_limits::max()) const; + void FlushCaching(); + private: template inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, @@ -154,6 +161,12 @@ private: inline bool IsBigPageContinous(size_t big_page_index) const; inline void SetBigPageContinous(size_t big_page_index, bool value); + template + void GetSubmappedRangeImpl( + GPUVAddr gpu_addr, std::size_t size, + std::vector, std::size_t>>& + result) const; + Core::System& system; Core::Memory::Memory& memory; Core::DeviceMemory& device_memory; @@ -201,10 +214,12 @@ private: Common::VirtualBuffer big_page_table_cpu; std::vector big_page_continous; + std::vector> page_stash{}; constexpr static size_t continous_bits = 64; const size_t unique_identifier; + std::unique_ptr accumulator; static std::atomic unique_identifier_generator; }; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f44c7df506..6b66ad7b60 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/common_types.h" #include "common/polyfill_thread.h" #include "video_core/cache_types.h" @@ -95,6 +96,12 @@ public: virtual void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0; + virtual void InnerInvalidation(std::span> sequences) { + for (const auto [cpu_addr, size] : sequences) { + InvalidateRegion(cpu_addr, size); + } + } + /// Notify rasterizer that any caches of the specified region are desync with guest virtual void OnCPUWrite(VAddr addr, u64 size) = 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 242bf9602a..6c4d745649 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); + gpu_memory->FlushCaching(); query_cache.UpdateCounters(); @@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) { void RasterizerVulkan::DispatchCompute() { FlushWork(); + gpu_memory->FlushCaching(); ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()}; if (!pipeline) { @@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache } } +void RasterizerVulkan::InnerInvalidation(std::span> sequences) { + { + std::scoped_lock lock{texture_cache.mutex}; + for (const auto [addr, size] : sequences) { + texture_cache.WriteMemory(addr, size); + } + } + { + std::scoped_lock lock{buffer_cache.mutex}; + for (const auto [addr, size] : sequences) { + buffer_cache.WriteMemory(addr, size); + } + } + { + for (const auto [addr, size] : sequences) { + query_cache.InvalidateRegion(addr, size); + pipeline_cache.InvalidateRegion(addr, size); + } + } +} + void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c661e5b197..472cc64d91 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -79,6 +79,7 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; + void InnerInvalidation(std::span> sequences) override; void OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override;