SMMU: Implement physical memory mirroring

This commit is contained in:
Fernando Sahmkow 2023-12-29 07:53:52 +01:00 committed by Liam
parent 0a2536a0df
commit 34a8d0cc8e
8 changed files with 226 additions and 40 deletions

View file

@ -10,8 +10,10 @@
#include <mutex> #include <mutex>
#include "common/common_types.h" #include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "common/virtual_buffer.h" #include "common/virtual_buffer.h"
namespace Core { namespace Core {
class DeviceMemory; class DeviceMemory;
@ -49,9 +51,25 @@ public:
template <typename T> template <typename T>
const T* GetPointer(DAddr address) const; const T* GetPointer(DAddr address) const;
DAddr GetAddressFromPAddr(PAddr address) const { template <typename Func>
void ApplyOpOnPAddr(PAddr address, Common::ScratchBuffer<u32>& buffer, Func&& operation) {
DAddr subbits = static_cast<DAddr>(address & page_mask); DAddr subbits = static_cast<DAddr>(address & page_mask);
return (static_cast<DAddr>(compressed_device_addr[(address >> page_bits)]) << page_bits) + subbits; const u32 base = compressed_device_addr[(address >> page_bits)];
if ((base >> MULTI_FLAG_BITS) == 0) [[likely]] {
const DAddr d_address = static_cast<DAddr>(base << page_bits) + subbits;
operation(d_address);
return;
}
InnerGatherDeviceAddresses(buffer, address);
for (u32 value : buffer) {
operation(static_cast<DAddr>(value << page_bits) + subbits);
}
}
template <typename Func>
void ApplyOpOnPointer(const u8* p, Common::ScratchBuffer<u32>& buffer, Func&& operation) {
PAddr address = GetRawPhysicalAddr<u8>(p);
ApplyOpOnPAddr(address, buffer, operation);
} }
PAddr GetPhysicalRawAddressFromDAddr(DAddr address) const { PAddr GetPhysicalRawAddressFromDAddr(DAddr address) const {
@ -98,6 +116,9 @@ private:
static constexpr size_t page_size = 1ULL << page_bits; static constexpr size_t page_size = 1ULL << page_bits;
static constexpr size_t page_mask = page_size - 1ULL; static constexpr size_t page_mask = page_size - 1ULL;
static constexpr u32 physical_address_base = 1U << page_bits; static constexpr u32 physical_address_base = 1U << page_bits;
static constexpr u32 MULTI_FLAG_BITS = 31;
static constexpr u32 MULTI_FLAG = 1U << MULTI_FLAG_BITS;
static constexpr u32 MULTI_MASK = ~MULTI_FLAG;
template <typename T> template <typename T>
T* GetPointerFromRaw(PAddr addr) { T* GetPointerFromRaw(PAddr addr) {
@ -117,6 +138,8 @@ private:
void WalkBlock(const DAddr addr, const std::size_t size, auto on_unmapped, auto on_memory, void WalkBlock(const DAddr addr, const std::size_t size, auto on_unmapped, auto on_memory,
auto increment); auto increment);
void InnerGatherDeviceAddresses(Common::ScratchBuffer<u32>& buffer, PAddr address);
std::unique_ptr<DeviceMemoryManagerAllocator<Traits>> impl; std::unique_ptr<DeviceMemoryManagerAllocator<Traits>> impl;
const uintptr_t physical_base; const uintptr_t physical_base;

View file

@ -18,10 +18,117 @@
namespace Core { namespace Core {
namespace {
class PhysicalAddressContainer {
public:
PhysicalAddressContainer() = default;
~PhysicalAddressContainer() = default;
void GatherValues(u32 start_entry, Common::ScratchBuffer<u32>& buffer) {
buffer.resize(8);
buffer.resize(0);
size_t index = 0;
const auto add_value = [&](u32 value) {
buffer[index] = value;
index++;
buffer.resize(index);
};
u32 iter_entry = start_entry;
Entry* current = &storage[iter_entry - 1];
add_value(current->value);
while (current->next_entry != 0) {
iter_entry = current->next_entry;
current = &storage[iter_entry - 1];
add_value(current->value);
}
}
u32 Register(u32 value) {
return RegisterImplementation(value);
}
void Register(u32 value, u32 start_entry) {
auto entry_id = RegisterImplementation(value);
u32 iter_entry = start_entry;
Entry* current = &storage[iter_entry - 1];
while (current->next_entry != 0) {
iter_entry = current->next_entry;
current = &storage[iter_entry - 1];
}
current->next_entry = entry_id;
}
std::pair<bool, u32> Unregister(u32 value, u32 start_entry) {
u32 iter_entry = start_entry;
Entry* previous{};
Entry* current = &storage[iter_entry - 1];
Entry* next{};
bool more_than_one_remaining = false;
u32 result_start{start_entry};
size_t count = 0;
while (current->value != value) {
count++;
previous = current;
iter_entry = current->next_entry;
current = &storage[iter_entry - 1];
}
// Find next
u32 next_entry = current->next_entry;
if (next_entry != 0) {
next = &storage[next_entry - 1];
more_than_one_remaining = next->next_entry != 0;
}
if (previous) {
previous->next_entry = next_entry;
} else {
result_start = next_entry;
}
free_entries.emplace_back(iter_entry);
return std::make_pair(more_than_one_remaining || count > 1, result_start);
}
u32 ReleaseEntry(u32 start_entry) {
Entry* current = &storage[start_entry - 1];
free_entries.emplace_back(start_entry);
return current->value;
}
private:
u32 RegisterImplementation(u32 value) {
auto entry_id = GetNewEntry();
auto& entry = storage[entry_id - 1];
entry.next_entry = 0;
entry.value = value;
return entry_id;
}
u32 GetNewEntry() {
if (!free_entries.empty()) {
u32 result = free_entries.front();
free_entries.pop_front();
return result;
}
storage.emplace_back();
u32 new_entry = static_cast<u32>(storage.size());
return new_entry;
}
struct Entry {
u32 next_entry{};
u32 value{};
};
std::deque<Entry> storage;
std::deque<u32> free_entries;
};
struct EmptyAllocator { struct EmptyAllocator {
EmptyAllocator([[maybe_unused]] DAddr address) {} EmptyAllocator([[maybe_unused]] DAddr address) {}
}; };
} // namespace
template <typename DTraits> template <typename DTraits>
struct DeviceMemoryManagerAllocator { struct DeviceMemoryManagerAllocator {
static constexpr bool supports_pinning = DTraits::supports_pinning; static constexpr bool supports_pinning = DTraits::supports_pinning;
@ -38,6 +145,7 @@ struct DeviceMemoryManagerAllocator {
std::conditional_t<supports_pinning, Common::FlatAllocator<DAddr, 0, pin_bits>, EmptyAllocator> std::conditional_t<supports_pinning, Common::FlatAllocator<DAddr, 0, pin_bits>, EmptyAllocator>
pin_allocator; pin_allocator;
Common::FlatAllocator<DAddr, 0, device_virtual_bits> main_allocator; Common::FlatAllocator<DAddr, 0, device_virtual_bits> main_allocator;
PhysicalAddressContainer multi_dev_address;
/// Returns true when vaddr -> vaddr+size is fully contained in the buffer /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
template <bool pin_area> template <bool pin_area>
@ -109,6 +217,9 @@ DeviceMemoryManager<Traits>::DeviceMemoryManager(const DeviceMemory& device_memo
cpu_backing_address(device_as_size >> Memory::YUZU_PAGEBITS) { cpu_backing_address(device_as_size >> Memory::YUZU_PAGEBITS) {
impl = std::make_unique<DeviceMemoryManagerAllocator<Traits>>(); impl = std::make_unique<DeviceMemoryManagerAllocator<Traits>>();
cached_pages = std::make_unique<CachedPages>(); cached_pages = std::make_unique<CachedPages>();
for (size_t i = 0; i < 1ULL << (33 - 12); i++) {
compressed_device_addr[i] = 0;
}
} }
template <typename Traits> template <typename Traits>
@ -155,8 +266,19 @@ void DeviceMemoryManager<Traits>::Map(DAddr address, VAddr virtual_address, size
} }
auto phys_addr = static_cast<u32>(GetRawPhysicalAddr(ptr) >> Memory::YUZU_PAGEBITS) + 1U; auto phys_addr = static_cast<u32>(GetRawPhysicalAddr(ptr) >> Memory::YUZU_PAGEBITS) + 1U;
compressed_physical_ptr[start_page_d + i] = phys_addr; compressed_physical_ptr[start_page_d + i] = phys_addr;
compressed_device_addr[phys_addr - 1U] = static_cast<u32>(start_page_d + i);
InsertCPUBacking(start_page_d + i, new_vaddress, process_id); InsertCPUBacking(start_page_d + i, new_vaddress, process_id);
const u32 base_dev = compressed_device_addr[phys_addr - 1U];
const u32 new_dev = static_cast<u32>(start_page_d + i);
if (base_dev == 0) [[likely]] {
compressed_device_addr[phys_addr - 1U] = new_dev;
continue;
}
u32 start_id = base_dev & MULTI_MASK;
if ((base_dev >> MULTI_FLAG_BITS) == 0) {
start_id = impl->multi_dev_address.Register(base_dev);
compressed_device_addr[phys_addr - 1U] = MULTI_FLAG | start_id;
}
impl->multi_dev_address.Register(new_dev, start_id);
} }
} }
@ -170,12 +292,38 @@ void DeviceMemoryManager<Traits>::Unmap(DAddr address, size_t size) {
auto phys_addr = compressed_physical_ptr[start_page_d + i]; auto phys_addr = compressed_physical_ptr[start_page_d + i];
compressed_physical_ptr[start_page_d + i] = 0; compressed_physical_ptr[start_page_d + i] = 0;
cpu_backing_address[start_page_d + i] = 0; cpu_backing_address[start_page_d + i] = 0;
if (phys_addr != 0) { if (phys_addr != 0) [[likely]] {
const u32 base_dev = compressed_device_addr[phys_addr - 1U];
if ((base_dev >> MULTI_FLAG_BITS) == 0) [[likely]] {
compressed_device_addr[phys_addr - 1] = 0; compressed_device_addr[phys_addr - 1] = 0;
continue;
}
const auto [more_entries, new_start] = impl->multi_dev_address.Unregister(
static_cast<u32>(start_page_d + i), base_dev & MULTI_MASK);
if (!more_entries) {
compressed_device_addr[phys_addr - 1] =
impl->multi_dev_address.ReleaseEntry(new_start);
continue;
}
compressed_device_addr[phys_addr - 1] = new_start | MULTI_FLAG;
} }
} }
} }
/// Collects into `buffer` every device page entry recorded for the physical page containing
/// `address`. A plain (non-multi) entry yields a buffer with that single raw entry; an entry
/// with the multi flag set yields all values of the chain it refers to.
template <typename Traits>
void DeviceMemoryManager<Traits>::InnerGatherDeviceAddresses(Common::ScratchBuffer<u32>& buffer,
                                                             PAddr address) {
    const size_t page_index = address >> page_bits;
    // NOTE(review): presumably serialises against Map/Unmap mutating the chains - confirm.
    std::scoped_lock lk(mapping_guard);
    const u32 entry = compressed_device_addr[page_index];
    const bool is_multi = (entry >> MULTI_FLAG_BITS) != 0;
    if (!is_multi) {
        // Single mapping: the entry itself is the device page.
        buffer.resize(1);
        buffer[0] = entry;
        return;
    }
    // The low bits of a multi entry are the id of the first chain entry.
    impl->multi_dev_address.GatherValues(entry & MULTI_MASK, buffer);
}
template <typename Traits> template <typename Traits>
template <typename T> template <typename T>
T* DeviceMemoryManager<Traits>::GetPointer(DAddr address) { T* DeviceMemoryManager<Traits>::GetPointer(DAddr address) {

View file

@ -16,8 +16,8 @@
namespace Service::Nvidia::NvCore { namespace Service::Nvidia::NvCore {
struct ContainerImpl { struct ContainerImpl {
explicit ContainerImpl(Tegra::Host1x::Host1x& host1x_) explicit ContainerImpl(Container& core, Tegra::Host1x::Host1x& host1x_)
: host1x{host1x_}, file{host1x_}, manager{host1x_}, device_file_data{} {} : host1x{host1x_}, file{core, host1x_}, manager{host1x_}, device_file_data{} {}
Tegra::Host1x::Host1x& host1x; Tegra::Host1x::Host1x& host1x;
NvMap file; NvMap file;
SyncpointManager manager; SyncpointManager manager;
@ -29,7 +29,7 @@ struct ContainerImpl {
}; };
Container::Container(Tegra::Host1x::Host1x& host1x_) { Container::Container(Tegra::Host1x::Host1x& host1x_) {
impl = std::make_unique<ContainerImpl>(host1x_); impl = std::make_unique<ContainerImpl>(*this, host1x_);
} }
Container::~Container() = default; Container::~Container() = default;

View file

@ -7,6 +7,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/core/nvmap.h" #include "core/hle/service/nvdrv/core/nvmap.h"
#include "core/memory.h" #include "core/memory.h"
#include "video_core/host1x/host1x.h" #include "video_core/host1x/host1x.h"
@ -64,7 +65,7 @@ NvResult NvMap::Handle::Duplicate(bool internal_session) {
return NvResult::Success; return NvResult::Success;
} }
NvMap::NvMap(Tegra::Host1x::Host1x& host1x_) : host1x{host1x_} {} NvMap::NvMap(Container& core_, Tegra::Host1x::Host1x& host1x_) : host1x{host1x_}, core{core_} {}
void NvMap::AddHandle(std::shared_ptr<Handle> handle_description) { void NvMap::AddHandle(std::shared_ptr<Handle> handle_description) {
std::scoped_lock lock(handles_lock); std::scoped_lock lock(handles_lock);
@ -160,6 +161,8 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are
// If not then allocate some space and map it // If not then allocate some space and map it
DAddr address{}; DAddr address{};
auto& smmu = host1x.MemoryManager(); auto& smmu = host1x.MemoryManager();
auto* session = core.GetSession(session_id);
auto allocate = std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1); auto allocate = std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1);
//: std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1); //: std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1);
while ((address = allocate(static_cast<size_t>(handle_description->aligned_size))) == 0) { while ((address = allocate(static_cast<size_t>(handle_description->aligned_size))) == 0) {
@ -179,7 +182,7 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are
handle_description->d_address = address; handle_description->d_address = address;
smmu.Map(address, handle_description->address, handle_description->aligned_size, smmu.Map(address, handle_description->address, handle_description->aligned_size,
session_id); session->smmu_id);
} }
handle_description->pins++; handle_description->pins++;

View file

@ -25,6 +25,8 @@ class Host1x;
} // namespace Tegra } // namespace Tegra
namespace Service::Nvidia::NvCore { namespace Service::Nvidia::NvCore {
class Container;
/** /**
* @brief The nvmap core class holds the global state for nvmap and provides methods to manage * @brief The nvmap core class holds the global state for nvmap and provides methods to manage
* handles * handles
@ -109,7 +111,7 @@ public:
bool can_unlock; //!< If the address region is ready to be unlocked bool can_unlock; //!< If the address region is ready to be unlocked
}; };
explicit NvMap(Tegra::Host1x::Host1x& host1x); explicit NvMap(Container& core, Tegra::Host1x::Host1x& host1x);
/** /**
* @brief Creates an unallocated handle of the given size * @brief Creates an unallocated handle of the given size
@ -173,5 +175,7 @@ private:
* @return If the handle was removed from the map * @return If the handle was removed from the map
*/ */
bool TryRemoveHandle(const Handle& handle_description); bool TryRemoveHandle(const Handle& handle_description);
Container& core;
}; };
} // namespace Service::Nvidia::NvCore } // namespace Service::Nvidia::NvCore

View file

@ -44,7 +44,8 @@ bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessA
// from outside classes. This also allows modification to the internals of the memory // from outside classes. This also allows modification to the internals of the memory
// subsystem without needing to rebuild all files that make use of the memory interface. // subsystem without needing to rebuild all files that make use of the memory interface.
struct Memory::Impl { struct Memory::Impl {
explicit Impl(Core::System& system_) : system{system_} {} explicit Impl(Core::System& system_)
: system{system_} {}
void SetCurrentPageTable(Kernel::KProcess& process) { void SetCurrentPageTable(Kernel::KProcess& process) {
current_page_table = &process.GetPageTable().GetImpl(); current_page_table = &process.GetPageTable().GetImpl();
@ -817,26 +818,31 @@ struct Memory::Impl {
void HandleRasterizerDownload(VAddr v_address, size_t size) { void HandleRasterizerDownload(VAddr v_address, size_t size) {
const auto* p = GetPointerImpl( const auto* p = GetPointerImpl(
v_address, []() {}, []() {}); v_address, []() {}, []() {});
auto& gpu_device_memory = system.Host1x().MemoryManager(); if (!gpu_device_memory) [[unlikely]] {
DAddr address = gpu_device_memory = &system.Host1x().MemoryManager();
gpu_device_memory.GetAddressFromPAddr(system.DeviceMemory().GetRawPhysicalAddr(p)); }
const size_t core = system.GetCurrentHostThreadID(); const size_t core = system.GetCurrentHostThreadID();
auto& current_area = rasterizer_read_areas[core]; auto& current_area = rasterizer_read_areas[core];
gpu_device_memory->ApplyOpOnPointer(
p, scratch_buffers[core], [&](DAddr address) {
const DAddr end_address = address + size; const DAddr end_address = address + size;
if (current_area.start_address <= address && end_address <= current_area.end_address) if (current_area.start_address <= address && end_address <= current_area.end_address)
[[likely]] { [[likely]] {
return; return;
} }
current_area = system.GPU().OnCPURead(address, size); current_area = system.GPU().OnCPURead(address, size);
});
} }
void HandleRasterizerWrite(VAddr v_address, size_t size) { void HandleRasterizerWrite(VAddr v_address, size_t size) {
const auto* p = GetPointerImpl( const auto* p = GetPointerImpl(
v_address, []() {}, []() {}); v_address, []() {}, []() {});
PAddr address = system.DeviceMemory().GetRawPhysicalAddr(p);
constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1; constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1;
const size_t core = std::min(system.GetCurrentHostThreadID(), const size_t core = std::min(system.GetCurrentHostThreadID(),
sys_core); // any other calls threads go to syscore. sys_core); // any other calls threads go to syscore.
if (!gpu_device_memory) [[unlikely]] {
gpu_device_memory = &system.Host1x().MemoryManager();
}
// Guard on sys_core; // Guard on sys_core;
if (core == sys_core) [[unlikely]] { if (core == sys_core) [[unlikely]] {
sys_core_guard.lock(); sys_core_guard.lock();
@ -846,6 +852,8 @@ struct Memory::Impl {
sys_core_guard.unlock(); sys_core_guard.unlock();
} }
}); });
gpu_device_memory->ApplyOpOnPointer(
p, scratch_buffers[core], [&](DAddr address) {
auto& current_area = rasterizer_write_areas[core]; auto& current_area = rasterizer_write_areas[core];
PAddr subaddress = address >> YUZU_PAGEBITS; PAddr subaddress = address >> YUZU_PAGEBITS;
bool do_collection = current_area.last_address == subaddress; bool do_collection = current_area.last_address == subaddress;
@ -857,6 +865,7 @@ struct Memory::Impl {
current_area.last_address = subaddress; current_area.last_address = subaddress;
} }
gpu_dirty_managers[core].Collect(address, size); gpu_dirty_managers[core].Collect(address, size);
});
} }
struct GPUDirtyState { struct GPUDirtyState {
@ -872,10 +881,12 @@ struct Memory::Impl {
} }
Core::System& system; Core::System& system;
Tegra::MaxwellDeviceMemoryManager* gpu_device_memory{};
Common::PageTable* current_page_table = nullptr; Common::PageTable* current_page_table = nullptr;
std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES> std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
rasterizer_read_areas{}; rasterizer_read_areas{};
std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{}; std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};
std::array<Common::ScratchBuffer<u32>, Core::Hardware::NUM_CPU_CORES> scratch_buffers{};
std::span<Core::GPUDirtyMemoryManager> gpu_dirty_managers; std::span<Core::GPUDirtyMemoryManager> gpu_dirty_managers;
std::mutex sys_core_guard; std::mutex sys_core_guard;

View file

@ -554,9 +554,8 @@ void RasterizerOpenGL::InvalidateRegion(DAddr addr, u64 size, VideoCommon::Cache
} }
} }
bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) { bool RasterizerOpenGL::OnCPUWrite(DAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return false; return false;
} }
@ -577,9 +576,9 @@ bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) {
return false; return false;
} }
void RasterizerOpenGL::OnCacheInvalidation(PAddr p_addr, u64 size) { void RasterizerOpenGL::OnCacheInvalidation(DAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return; return;
} }

View file

@ -602,8 +602,7 @@ void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<DAddr, std::s
} }
} }
bool RasterizerVulkan::OnCPUWrite(PAddr p_addr, u64 size) { bool RasterizerVulkan::OnCPUWrite(DAddr addr, u64 size) {
const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return false; return false;
} }
@ -624,8 +623,7 @@ bool RasterizerVulkan::OnCPUWrite(PAddr p_addr, u64 size) {
return false; return false;
} }
void RasterizerVulkan::OnCacheInvalidation(PAddr p_addr, u64 size) { void RasterizerVulkan::OnCacheInvalidation(DAddr addr, u64 size) {
const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return; return;
} }