From 34a8d0cc8e04b4b9d8e5a75e552f0adb31b5d718 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 29 Dec 2023 07:53:52 +0100 Subject: [PATCH] SMMU: Implement physical memory mirroring --- src/core/device_memory_manager.h | 27 ++- src/core/device_memory_manager.inc | 154 +++++++++++++++++- src/core/hle/service/nvdrv/core/container.cpp | 6 +- src/core/hle/service/nvdrv/core/nvmap.cpp | 7 +- src/core/hle/service/nvdrv/core/nvmap.h | 6 +- src/core/memory.cpp | 53 +++--- .../renderer_opengl/gl_rasterizer.cpp | 7 +- .../renderer_vulkan/vk_rasterizer.cpp | 6 +- 8 files changed, 226 insertions(+), 40 deletions(-) diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h index 1a63cbd093..7c7726348e 100644 --- a/src/core/device_memory_manager.h +++ b/src/core/device_memory_manager.h @@ -10,8 +10,10 @@ #include #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "common/virtual_buffer.h" + namespace Core { class DeviceMemory; @@ -49,9 +51,25 @@ public: template const T* GetPointer(DAddr address) const; - DAddr GetAddressFromPAddr(PAddr address) const { + template + void ApplyOpOnPAddr(PAddr address, Common::ScratchBuffer& buffer, Func&& operation) { DAddr subbits = static_cast(address & page_mask); - return (static_cast(compressed_device_addr[(address >> page_bits)]) << page_bits) + subbits; + const u32 base = compressed_device_addr[(address >> page_bits)]; + if ((base >> MULTI_FLAG_BITS) == 0) [[likely]] { + const DAddr d_address = static_cast(base << page_bits) + subbits; + operation(d_address); + return; + } + InnerGatherDeviceAddresses(buffer, address); + for (u32 value : buffer) { + operation(static_cast(value << page_bits) + subbits); + } + } + + template + void ApplyOpOnPointer(const u8* p, Common::ScratchBuffer& buffer, Func&& operation) { + PAddr address = GetRawPhysicalAddr(p); + ApplyOpOnPAddr(address, buffer, operation); } PAddr GetPhysicalRawAddressFromDAddr(DAddr address) const { @@ -98,6 +116,9 @@ private: static constexpr size_t page_size = 1ULL << page_bits; static constexpr size_t page_mask = page_size - 1ULL; static constexpr u32 physical_address_base = 1U << page_bits; + static constexpr u32 MULTI_FLAG_BITS = 31; + static constexpr u32 MULTI_FLAG = 1U << MULTI_FLAG_BITS; + static constexpr u32 MULTI_MASK = ~MULTI_FLAG; template T* GetPointerFromRaw(PAddr addr) { @@ -117,6 +138,8 @@ private: void WalkBlock(const DAddr addr, const std::size_t size, auto on_unmapped, auto on_memory, auto increment); + void InnerGatherDeviceAddresses(Common::ScratchBuffer& buffer, PAddr address); + std::unique_ptr> impl; const uintptr_t physical_base; diff --git a/src/core/device_memory_manager.inc b/src/core/device_memory_manager.inc index 8c5f82d315..4fb3ad3ab5 100644 --- a/src/core/device_memory_manager.inc +++ b/src/core/device_memory_manager.inc @@ -18,10 +18,117 @@ namespace Core { +namespace { + +class PhysicalAddressContainer { +public: + PhysicalAddressContainer() = default; + ~PhysicalAddressContainer() = default; + + void GatherValues(u32 start_entry, Common::ScratchBuffer& buffer) { + buffer.resize(8); + buffer.resize(0); + size_t index = 0; + const auto add_value = [&](u32 value) { + buffer[index] = value; + index++; + buffer.resize(index); + }; + + u32 iter_entry = start_entry; + Entry* current = &storage[iter_entry - 1]; + add_value(current->value); + while (current->next_entry != 0) { + iter_entry = current->next_entry; + current = &storage[iter_entry - 1]; + add_value(current->value); + } + } + + u32 Register(u32 value) { + return RegisterImplementation(value); + } + + void Register(u32 value, u32 start_entry) { + auto entry_id = RegisterImplementation(value); + u32 iter_entry = start_entry; + Entry* current = &storage[iter_entry - 1]; + while (current->next_entry != 0) { + iter_entry = current->next_entry; + current = &storage[iter_entry - 1]; + } + current->next_entry = entry_id; + } + + std::pair Unregister(u32 value, u32 start_entry) { + u32 iter_entry = start_entry; + Entry* previous{}; + Entry* current = &storage[iter_entry - 1]; + Entry* next{}; + bool more_than_one_remaining = false; + u32 result_start{start_entry}; + size_t count = 0; + while (current->value != value) { + count++; + previous = current; + iter_entry = current->next_entry; + current = &storage[iter_entry - 1]; + } + // Find next + u32 next_entry = current->next_entry; + if (next_entry != 0) { + next = &storage[next_entry - 1]; + more_than_one_remaining = next->next_entry != 0; + } + if (previous) { + previous->next_entry = next_entry; + } else { + result_start = next_entry; + } + free_entries.emplace_back(iter_entry); + return std::make_pair(more_than_one_remaining || count > 1, result_start); + } + + u32 ReleaseEntry(u32 start_entry) { + Entry* current = &storage[start_entry - 1]; + free_entries.emplace_back(start_entry); + return current->value; + } + +private: + u32 RegisterImplementation(u32 value) { + auto entry_id = GetNewEntry(); + auto& entry = storage[entry_id - 1]; + entry.next_entry = 0; + entry.value = value; + return entry_id; + } + u32 GetNewEntry() { + if (!free_entries.empty()) { + u32 result = free_entries.front(); + free_entries.pop_front(); + return result; + } + storage.emplace_back(); + u32 new_entry = static_cast(storage.size()); + return new_entry; + } + + struct Entry { + u32 next_entry{}; + u32 value{}; + }; + + std::deque storage; + std::deque free_entries; +}; + struct EmptyAllocator { EmptyAllocator([[maybe_unused]] DAddr address) {} }; +} // namespace + template struct DeviceMemoryManagerAllocator { static constexpr bool supports_pinning = DTraits::supports_pinning; @@ -38,6 +145,7 @@ struct DeviceMemoryManagerAllocator { std::conditional_t, EmptyAllocator> pin_allocator; Common::FlatAllocator main_allocator; + PhysicalAddressContainer multi_dev_address; /// Returns true when vaddr -> vaddr+size is fully contained in the buffer template @@ -109,6 +217,9 @@ DeviceMemoryManager::DeviceMemoryManager(const DeviceMemory& device_memo cpu_backing_address(device_as_size >> Memory::YUZU_PAGEBITS) { impl = std::make_unique>(); cached_pages = std::make_unique(); + for (size_t i = 0; i < 1ULL << (33 - 12); i++) { + compressed_device_addr[i] = 0; + } } template @@ -155,8 +266,19 @@ void DeviceMemoryManager::Map(DAddr address, VAddr virtual_address, size } auto phys_addr = static_cast(GetRawPhysicalAddr(ptr) >> Memory::YUZU_PAGEBITS) + 1U; compressed_physical_ptr[start_page_d + i] = phys_addr; - compressed_device_addr[phys_addr - 1U] = static_cast(start_page_d + i); InsertCPUBacking(start_page_d + i, new_vaddress, process_id); + const u32 base_dev = compressed_device_addr[phys_addr - 1U]; + const u32 new_dev = static_cast(start_page_d + i); + if (base_dev == 0) [[likely]] { + compressed_device_addr[phys_addr - 1U] = new_dev; + continue; + } + u32 start_id = base_dev & MULTI_MASK; + if ((base_dev >> MULTI_FLAG_BITS) == 0) { + start_id = impl->multi_dev_address.Register(base_dev); + compressed_device_addr[phys_addr - 1U] = MULTI_FLAG | start_id; + } + impl->multi_dev_address.Register(new_dev, start_id); } } @@ -170,12 +292,38 @@ void DeviceMemoryManager::Unmap(DAddr address, size_t size) { auto phys_addr = compressed_physical_ptr[start_page_d + i]; compressed_physical_ptr[start_page_d + i] = 0; cpu_backing_address[start_page_d + i] = 0; - if (phys_addr != 0) { - compressed_device_addr[phys_addr - 1] = 0; + if (phys_addr != 0) [[likely]] { + const u32 base_dev = compressed_device_addr[phys_addr - 1U]; + if ((base_dev >> MULTI_FLAG_BITS) == 0) [[likely]] { + compressed_device_addr[phys_addr - 1] = 0; + continue; + } + const auto [more_entries, new_start] = impl->multi_dev_address.Unregister( + static_cast(start_page_d + i), base_dev & MULTI_MASK); + if (!more_entries) { + compressed_device_addr[phys_addr - 1] = + impl->multi_dev_address.ReleaseEntry(new_start); + continue; + } + compressed_device_addr[phys_addr - 1] = new_start | MULTI_FLAG; } } } +template +void DeviceMemoryManager::InnerGatherDeviceAddresses(Common::ScratchBuffer& buffer, + PAddr address) { + size_t phys_addr = address >> page_bits; + std::scoped_lock lk(mapping_guard); + u32 backing = compressed_device_addr[phys_addr]; + if ((backing >> MULTI_FLAG_BITS) != 0) { + impl->multi_dev_address.GatherValues(backing & MULTI_MASK, buffer); + return; + } + buffer.resize(1); + buffer[0] = backing; +} + template template T* DeviceMemoryManager::GetPointer(DAddr address) { diff --git a/src/core/hle/service/nvdrv/core/container.cpp b/src/core/hle/service/nvdrv/core/container.cpp index 7c2231fe6e..e12ce05c13 100644 --- a/src/core/hle/service/nvdrv/core/container.cpp +++ b/src/core/hle/service/nvdrv/core/container.cpp @@ -16,8 +16,8 @@ namespace Service::Nvidia::NvCore { struct ContainerImpl { - explicit ContainerImpl(Tegra::Host1x::Host1x& host1x_) - : host1x{host1x_}, file{host1x_}, manager{host1x_}, device_file_data{} {} + explicit ContainerImpl(Container& core, Tegra::Host1x::Host1x& host1x_) + : host1x{host1x_}, file{core, host1x_}, manager{host1x_}, device_file_data{} {} Tegra::Host1x::Host1x& host1x; NvMap file; SyncpointManager manager; @@ -29,7 +29,7 @@ struct ContainerImpl { }; Container::Container(Tegra::Host1x::Host1x& host1x_) { - impl = std::make_unique(host1x_); + impl = std::make_unique(*this, host1x_); } Container::~Container() = default; diff --git a/src/core/hle/service/nvdrv/core/nvmap.cpp b/src/core/hle/service/nvdrv/core/nvmap.cpp index 7879c6f04a..e4168a37c5 100644 --- a/src/core/hle/service/nvdrv/core/nvmap.cpp +++ b/src/core/hle/service/nvdrv/core/nvmap.cpp @@ -7,6 +7,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" +#include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/core/nvmap.h" #include "core/memory.h" #include "video_core/host1x/host1x.h" @@ -64,7 +65,7 @@ NvResult NvMap::Handle::Duplicate(bool internal_session) { return NvResult::Success; } -NvMap::NvMap(Tegra::Host1x::Host1x& host1x_) : host1x{host1x_} {} +NvMap::NvMap(Container& core_, Tegra::Host1x::Host1x& host1x_) : host1x{host1x_}, core{core_} {} void NvMap::AddHandle(std::shared_ptr handle_description) { std::scoped_lock lock(handles_lock); @@ -160,6 +161,8 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are // If not then allocate some space and map it DAddr address{}; auto& smmu = host1x.MemoryManager(); + auto* session = core.GetSession(session_id); + auto allocate = std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1); //: std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1); while ((address = allocate(static_cast(handle_description->aligned_size))) == 0) { @@ -179,7 +182,7 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are handle_description->d_address = address; smmu.Map(address, handle_description->address, handle_description->aligned_size, - session_id); + session->smmu_id); } handle_description->pins++; diff --git a/src/core/hle/service/nvdrv/core/nvmap.h b/src/core/hle/service/nvdrv/core/nvmap.h index e9e9e8b5be..7dd6d26c3c 100644 --- a/src/core/hle/service/nvdrv/core/nvmap.h +++ b/src/core/hle/service/nvdrv/core/nvmap.h @@ -25,6 +25,8 @@ class Host1x; } // namespace Tegra namespace Service::Nvidia::NvCore { + +class Container; /** * @brief The nvmap core class holds the global state for nvmap and provides methods to manage * handles @@ -109,7 +111,7 @@ public: bool can_unlock; //!< If the address region is ready to be unlocked }; - explicit NvMap(Tegra::Host1x::Host1x& host1x); + explicit NvMap(Container& core, Tegra::Host1x::Host1x& host1x); /** * @brief Creates an unallocated handle of the given size @@ -173,5 +175,7 @@ private: * @return If the handle was removed from the map */ bool TryRemoveHandle(const Handle& handle_description); + + Container& core; }; } // namespace Service::Nvidia::NvCore diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 609e775aed..f126840cbf 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -44,7 +44,8 @@ bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessA // from outside classes. This also allows modification to the internals of the memory // subsystem without needing to rebuild all files that make use of the memory interface. struct Memory::Impl { - explicit Impl(Core::System& system_) : system{system_} {} + explicit Impl(Core::System& system_) + : system{system_} {} void SetCurrentPageTable(Kernel::KProcess& process) { current_page_table = &process.GetPageTable().GetImpl(); @@ -817,26 +818,31 @@ struct Memory::Impl { void HandleRasterizerDownload(VAddr v_address, size_t size) { const auto* p = GetPointerImpl( v_address, []() {}, []() {}); - auto& gpu_device_memory = system.Host1x().MemoryManager(); - DAddr address = - gpu_device_memory.GetAddressFromPAddr(system.DeviceMemory().GetRawPhysicalAddr(p)); + if (!gpu_device_memory) [[unlikely]] { + gpu_device_memory = &system.Host1x().MemoryManager(); + } const size_t core = system.GetCurrentHostThreadID(); auto& current_area = rasterizer_read_areas[core]; - const DAddr end_address = address + size; - if (current_area.start_address <= address && end_address <= current_area.end_address) - [[likely]] { - return; - } - current_area = system.GPU().OnCPURead(address, size); + gpu_device_memory->ApplyOpOnPointer( + p, scratch_buffers[core], [&](DAddr address) { + const DAddr end_address = address + size; + if (current_area.start_address <= address && end_address <= current_area.end_address) + [[likely]] { + return; + } + current_area = system.GPU().OnCPURead(address, size); + }); } void HandleRasterizerWrite(VAddr v_address, size_t size) { const auto* p = GetPointerImpl( v_address, []() {}, []() {}); - PAddr address = system.DeviceMemory().GetRawPhysicalAddr(p); constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1; const size_t core = std::min(system.GetCurrentHostThreadID(), sys_core); // any other calls threads go to syscore. + if (!gpu_device_memory) [[unlikely]] { + gpu_device_memory = &system.Host1x().MemoryManager(); + } // Guard on sys_core; if (core == sys_core) [[unlikely]] { sys_core_guard.lock(); @@ -846,17 +852,20 @@ struct Memory::Impl { sys_core_guard.unlock(); } }); - auto& current_area = rasterizer_write_areas[core]; - PAddr subaddress = address >> YUZU_PAGEBITS; - bool do_collection = current_area.last_address == subaddress; - if (!do_collection) [[unlikely]] { - do_collection = system.GPU().OnCPUWrite(address, size); - if (!do_collection) { - return; + gpu_device_memory->ApplyOpOnPointer( + p, scratch_buffers[core], [&](DAddr address) { + auto& current_area = rasterizer_write_areas[core]; + PAddr subaddress = address >> YUZU_PAGEBITS; + bool do_collection = current_area.last_address == subaddress; + if (!do_collection) [[unlikely]] { + do_collection = system.GPU().OnCPUWrite(address, size); + if (!do_collection) { + return; + } + current_area.last_address = subaddress; } - current_area.last_address = subaddress; - } - gpu_dirty_managers[core].Collect(address, size); + gpu_dirty_managers[core].Collect(address, size); + }); } struct GPUDirtyState { @@ -872,10 +881,12 @@ struct Memory::Impl { } Core::System& system; + Tegra::MaxwellDeviceMemoryManager* gpu_device_memory{}; Common::PageTable* current_page_table = nullptr; std::array rasterizer_read_areas{}; std::array rasterizer_write_areas{}; + std::array, Core::Hardware::NUM_CPU_CORES> scratch_buffers{}; std::span gpu_dirty_managers; std::mutex sys_core_guard; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index ca31e2fbd6..71b748c743 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -554,9 +554,8 @@ void RasterizerOpenGL::InvalidateRegion(DAddr addr, u64 size, VideoCommon::Cache } } -bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) { +bool RasterizerOpenGL::OnCPUWrite(DAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - const DAddr addr = device_memory.GetAddressFromPAddr(p_addr); if (addr == 0 || size == 0) { return false; } @@ -577,9 +576,9 @@ bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) { return false; } -void RasterizerOpenGL::OnCacheInvalidation(PAddr p_addr, u64 size) { +void RasterizerOpenGL::OnCacheInvalidation(DAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - const DAddr addr = device_memory.GetAddressFromPAddr(p_addr); + if (addr == 0 || size == 0) { return; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index efcc349a0b..7db1319856 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -602,8 +602,7 @@ void RasterizerVulkan::InnerInvalidation(std::span