diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h
index 1a63cbd093..7c7726348e 100644
--- a/src/core/device_memory_manager.h
+++ b/src/core/device_memory_manager.h
@@ -10,8 +10,10 @@
 #include <mutex>
 
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "common/virtual_buffer.h"
 
+
 namespace Core {
 
 class DeviceMemory;
@@ -49,9 +51,25 @@ public:
     template <typename T>
     const T* GetPointer(DAddr address) const;
 
-    DAddr GetAddressFromPAddr(PAddr address) const {
+    template <typename Func>
+    void ApplyOpOnPAddr(PAddr address, Common::ScratchBuffer<u32>& buffer, Func&& operation) {
         DAddr subbits = static_cast<DAddr>(address & page_mask);
-        return (static_cast<DAddr>(compressed_device_addr[(address >> page_bits)]) << page_bits) + subbits;
+        const u32 base = compressed_device_addr[(address >> page_bits)];
+        if ((base >> MULTI_FLAG_BITS) == 0) [[likely]] {
+            const DAddr d_address = (static_cast<DAddr>(base) << page_bits) + subbits;
+            operation(d_address);
+            return;
+        }
+        InnerGatherDeviceAddresses(buffer, address);
+        for (u32 value : buffer) {
+            operation((static_cast<DAddr>(value) << page_bits) + subbits);
+        }
+    }
+
+    template <typename Func>
+    void ApplyOpOnPointer(const u8* p, Common::ScratchBuffer<u32>& buffer, Func&& operation) {
+        PAddr address = GetRawPhysicalAddr(p);
+        ApplyOpOnPAddr(address, buffer, operation);
     }
 
     PAddr GetPhysicalRawAddressFromDAddr(DAddr address) const {
@@ -98,6 +116,9 @@ private:
     static constexpr size_t page_size = 1ULL << page_bits;
     static constexpr size_t page_mask = page_size - 1ULL;
     static constexpr u32 physical_address_base = 1U << page_bits;
+    static constexpr u32 MULTI_FLAG_BITS = 31;
+    static constexpr u32 MULTI_FLAG = 1U << MULTI_FLAG_BITS;
+    static constexpr u32 MULTI_MASK = ~MULTI_FLAG;
 
     template <typename T>
     T* GetPointerFromRaw(PAddr addr) {
@@ -117,6 +138,8 @@ private:
     void WalkBlock(const DAddr addr, const std::size_t size, auto on_unmapped, auto on_memory,
                    auto increment);
 
+    void InnerGatherDeviceAddresses(Common::ScratchBuffer<u32>& buffer, PAddr address);
+
     std::unique_ptr<DeviceMemoryManagerAllocator<Traits>> impl;
 
     const uintptr_t physical_base;
diff --git a/src/core/device_memory_manager.inc b/src/core/device_memory_manager.inc
index 8c5f82d315..4fb3ad3ab5 100644
--- a/src/core/device_memory_manager.inc
+++ b/src/core/device_memory_manager.inc
@@ -18,10 +18,117 @@
 
 namespace Core {
 
+namespace {
+
+class PhysicalAddressContainer {
+public:
+    PhysicalAddressContainer() = default;
+    ~PhysicalAddressContainer() = default;
+
+    void GatherValues(u32 start_entry, Common::ScratchBuffer<u32>& buffer) {
+        // ScratchBuffer::resize never shrinks its allocation, so this reserves
+        // room for the common case and then empties the buffer for appending.
+        buffer.resize(8);
+        buffer.resize(0);
+        size_t index = 0;
+        const auto add_value = [&](u32 value) {
+            // Grow before writing so the store never lands past the current
+            // capacity when a chain holds more than eight entries.
+            buffer.resize(index + 1);
+            buffer[index] = value;
+            index++;
+        };
+
+        u32 iter_entry = start_entry;
+        Entry* current = &storage[iter_entry - 1];
+        add_value(current->value);
+        while (current->next_entry != 0) {
+            iter_entry = current->next_entry;
+            current = &storage[iter_entry - 1];
+            add_value(current->value);
+        }
+    }
+
+    u32 Register(u32 value) {
+        return RegisterImplementation(value);
+    }
+
+    void Register(u32 value, u32 start_entry) {
+        auto entry_id = RegisterImplementation(value);
+        u32 iter_entry = start_entry;
+        Entry* current = &storage[iter_entry - 1];
+        while (current->next_entry != 0) {
+            iter_entry = current->next_entry;
+            current = &storage[iter_entry - 1];
+        }
+        current->next_entry = entry_id;
+    }
+
+    std::pair<bool, u32> Unregister(u32 value, u32 start_entry) {
+        u32 iter_entry = start_entry;
+        Entry* previous{};
+        Entry* current = &storage[iter_entry - 1];
+        Entry* next{};
+        bool more_than_one_remaining = false;
+        u32 result_start{start_entry};
+        size_t count = 0;
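+        // Walk the chain until the node holding `value` is found; `count` records
+        // how many nodes precede it, which feeds the "more than one remaining"
+        // result returned below.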
+        while (current->value != value) {
+            count++;
+            previous = current;
+            iter_entry = current->next_entry;
+            current = &storage[iter_entry - 1];
+        }
+        // Find next
+        u32 next_entry = current->next_entry;
+        if (next_entry != 0) {
+            next = &storage[next_entry - 1];
+            more_than_one_remaining = next->next_entry != 0;
+        }
+        if (previous) {
+            previous->next_entry = next_entry;
+        } else {
+            result_start = next_entry;
+        }
+        free_entries.emplace_back(iter_entry);
+        // More than one entry also survives when the removed node had both a
+        // predecessor and a successor (count == 1 && next_entry != 0).
+        return std::make_pair(
+            more_than_one_remaining || count > 1 || (count == 1 && next_entry != 0),
+            result_start);
+    }
+
+    u32 ReleaseEntry(u32 start_entry) {
+        Entry* current = &storage[start_entry - 1];
+        free_entries.emplace_back(start_entry);
+        return current->value;
+    }
+
+private:
+    u32 RegisterImplementation(u32 value) {
+        auto entry_id = GetNewEntry();
+        auto& entry = storage[entry_id - 1];
+        entry.next_entry = 0;
+        entry.value = value;
+        return entry_id;
+    }
+
+    u32 GetNewEntry() {
+        if (!free_entries.empty()) {
+            u32 result = free_entries.front();
+            free_entries.pop_front();
+            return result;
+        }
+        storage.emplace_back();
+        u32 new_entry = static_cast<u32>(storage.size());
+        return new_entry;
+    }
+
+    struct Entry {
+        u32 next_entry{};
+        u32 value{};
+    };
+
+    std::deque<Entry> storage;
+    std::deque<u32> free_entries;
+};
+
 struct EmptyAllocator {
     EmptyAllocator([[maybe_unused]] DAddr address) {}
 };
 
+} // namespace
+
 template <typename DTraits>
 struct DeviceMemoryManagerAllocator {
     static constexpr bool supports_pinning = DTraits::supports_pinning;
@@ -38,6 +145,7 @@ struct DeviceMemoryManagerAllocator {
     std::conditional_t<supports_pinning, Common::FlatAllocator<DAddr, 0, pin_bits>, EmptyAllocator>
         pin_allocator;
     Common::FlatAllocator<DAddr, 0, device_virtual_bits> main_allocator;
+    PhysicalAddressContainer multi_dev_address;
 
     /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
     template <bool pin_area>
@@ -109,6 +217,9 @@ DeviceMemoryManager<Traits>::DeviceMemoryManager(const DeviceMemory& device_memo
       cpu_backing_address(device_as_size >> Memory::YUZU_PAGEBITS) {
     impl = std::make_unique<DeviceMemoryManagerAllocator<Traits>>();
     cached_pages = std::make_unique<CachedPages>();
+    // Clear the physical -> device reverse map (a 33-bit physical space in
+    // 4 KiB pages).
+    for (size_t i = 0; i < 1ULL << (33 - 12); i++) {
+        compressed_device_addr[i] = 0;
+    }
 }
 
 template <typename Traits>
@@ -155,8 +266,19 @@ void DeviceMemoryManager<Traits>::Map(DAddr address, VAddr virtual_address, size
         }
         auto phys_addr = static_cast<u32>(GetRawPhysicalAddr(ptr) >> Memory::YUZU_PAGEBITS) + 1U;
         compressed_physical_ptr[start_page_d + i] = phys_addr;
-        compressed_device_addr[phys_addr - 1U] = static_cast<u32>(start_page_d + i);
         InsertCPUBacking(start_page_d + i, new_vaddress, process_id);
+        const u32 base_dev = compressed_device_addr[phys_addr - 1U];
+        const u32 new_dev = static_cast<u32>(start_page_d + i);
+        if (base_dev == 0) [[likely]] {
+            compressed_device_addr[phys_addr - 1U] = new_dev;
+            continue;
+        }
+        // The physical page is already mapped elsewhere: spill the existing value
+        // into the multi-address chain (if not spilled already) and append.
+        u32 start_id = base_dev & MULTI_MASK;
+        if ((base_dev >> MULTI_FLAG_BITS) == 0) {
+            start_id = impl->multi_dev_address.Register(base_dev);
+            compressed_device_addr[phys_addr - 1U] = MULTI_FLAG | start_id;
+        }
+        impl->multi_dev_address.Register(new_dev, start_id);
     }
 }
 
@@ -170,12 +292,38 @@ void DeviceMemoryManager<Traits>::Unmap(DAddr address, size_t size) {
         auto phys_addr = compressed_physical_ptr[start_page_d + i];
         compressed_physical_ptr[start_page_d + i] = 0;
         cpu_backing_address[start_page_d + i] = 0;
-        if (phys_addr != 0) {
-            compressed_device_addr[phys_addr - 1] = 0;
+        if (phys_addr != 0) [[likely]] {
+            const u32 base_dev = compressed_device_addr[phys_addr - 1U];
+            if ((base_dev >> MULTI_FLAG_BITS) == 0) [[likely]] {
+                compressed_device_addr[phys_addr - 1] = 0;
+                continue;
+            }
+            const auto [more_entries, new_start] = impl->multi_dev_address.Unregister(
+                static_cast<u32>(start_page_d + i), base_dev & MULTI_MASK);
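+            // Only one mapping survives the unlink: collapse the chain back to a
+            // plain (unflagged) entry value and recycle the remaining list node.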
+            if (!more_entries) {
+                compressed_device_addr[phys_addr - 1] =
+                    impl->multi_dev_address.ReleaseEntry(new_start);
+                continue;
+            }
+            compressed_device_addr[phys_addr - 1] = new_start | MULTI_FLAG;
         }
     }
 }
 
+template <typename Traits>
+void DeviceMemoryManager<Traits>::InnerGatherDeviceAddresses(Common::ScratchBuffer<u32>& buffer,
+                                                             PAddr address) {
+    size_t phys_addr = address >> page_bits;
+    std::scoped_lock lk(mapping_guard);
+    u32 backing = compressed_device_addr[phys_addr];
+    if ((backing >> MULTI_FLAG_BITS) != 0) {
+        impl->multi_dev_address.GatherValues(backing & MULTI_MASK, buffer);
+        return;
+    }
+    buffer.resize(1);
+    buffer[0] = backing;
+}
+
 template <typename Traits>
 template <typename T>
 T* DeviceMemoryManager<Traits>::GetPointer(DAddr address) {
diff --git a/src/core/hle/service/nvdrv/core/container.cpp b/src/core/hle/service/nvdrv/core/container.cpp
index 7c2231fe6e..e12ce05c13 100644
--- a/src/core/hle/service/nvdrv/core/container.cpp
+++ b/src/core/hle/service/nvdrv/core/container.cpp
@@ -16,8 +16,8 @@
 namespace Service::Nvidia::NvCore {
 
 struct ContainerImpl {
-    explicit ContainerImpl(Tegra::Host1x::Host1x& host1x_)
-        : host1x{host1x_}, file{host1x_}, manager{host1x_}, device_file_data{} {}
+    explicit ContainerImpl(Container& core, Tegra::Host1x::Host1x& host1x_)
+        : host1x{host1x_}, file{core, host1x_}, manager{host1x_}, device_file_data{} {}
     Tegra::Host1x::Host1x& host1x;
     NvMap file;
     SyncpointManager manager;
@@ -29,7 +29,7 @@
 };
 
 Container::Container(Tegra::Host1x::Host1x& host1x_) {
-    impl = std::make_unique<ContainerImpl>(host1x_);
+    impl = std::make_unique<ContainerImpl>(*this, host1x_);
 }
 
 Container::~Container() = default;
diff --git a/src/core/hle/service/nvdrv/core/nvmap.cpp b/src/core/hle/service/nvdrv/core/nvmap.cpp
index 7879c6f04a..e4168a37c5 100644
--- a/src/core/hle/service/nvdrv/core/nvmap.cpp
+++ b/src/core/hle/service/nvdrv/core/nvmap.cpp
@@ -7,6 +7,7 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/hle/service/nvdrv/core/container.h"
 #include "core/hle/service/nvdrv/core/nvmap.h"
 #include "core/memory.h"
 #include "video_core/host1x/host1x.h"
@@ -64,7 +65,7 @@ NvResult NvMap::Handle::Duplicate(bool internal_session) {
     return NvResult::Success;
 }
 
-NvMap::NvMap(Tegra::Host1x::Host1x& host1x_) : host1x{host1x_} {}
+NvMap::NvMap(Container& core_, Tegra::Host1x::Host1x& host1x_) : host1x{host1x_}, core{core_} {}
 
 void NvMap::AddHandle(std::shared_ptr<Handle> handle_description) {
     std::scoped_lock lock(handles_lock);
@@ -160,6 +161,8 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are
         // If not then allocate some space and map it
         DAddr address{};
         auto& smmu = host1x.MemoryManager();
+        auto* session = core.GetSession(session_id);
+
         auto allocate = std::bind(&Tegra::MaxwellDeviceMemoryManager::Allocate, &smmu, _1);
         while ((address = allocate(static_cast<size_t>(handle_description->aligned_size))) == 0) {
@@ -179,7 +182,7 @@ DAddr NvMap::PinHandle(NvMap::Handle::Id handle, size_t session_id, bool low_are
         handle_description->d_address = address;
 
         smmu.Map(address, handle_description->address, handle_description->aligned_size,
-                 session_id);
+                 session->smmu_id);
     }
 
     handle_description->pins++;
diff --git a/src/core/hle/service/nvdrv/core/nvmap.h b/src/core/hle/service/nvdrv/core/nvmap.h
index e9e9e8b5be..7dd6d26c3c 100644
--- a/src/core/hle/service/nvdrv/core/nvmap.h
+++ b/src/core/hle/service/nvdrv/core/nvmap.h
@@ -25,6 +25,8 @@ class Host1x;
 } // namespace Tegra
 
 namespace Service::Nvidia::NvCore {
+
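+// Forward declaration: NvMap keeps a back-reference to its owning Container so
+// that PinHandle can resolve an nvdrv session id to the SMMU id used for mapping.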
+class Container;
 /**
  * @brief The nvmap core class holds the global state for nvmap and provides methods to manage
  * handles
@@ -109,7 +111,7 @@ public:
         bool can_unlock; //!< If the address region is ready to be unlocked
     };
 
-    explicit NvMap(Tegra::Host1x::Host1x& host1x);
+    explicit NvMap(Container& core, Tegra::Host1x::Host1x& host1x);
 
     /**
      * @brief Creates an unallocated handle of the given size
@@ -173,5 +175,7 @@ private:
      * @return If the handle was removed from the map
      */
     bool TryRemoveHandle(const Handle& handle_description);
+
+    Container& core;
 };
 } // namespace Service::Nvidia::NvCore
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 609e775aed..f126840cbf 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -44,7 +44,8 @@ bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessA
 // from outside classes. This also allows modification to the internals of the memory
 // subsystem without needing to rebuild all files that make use of the memory interface.
 struct Memory::Impl {
-    explicit Impl(Core::System& system_) : system{system_} {}
+    explicit Impl(Core::System& system_)
+        : system{system_} {}
 
     void SetCurrentPageTable(Kernel::KProcess& process) {
         current_page_table = &process.GetPageTable().GetImpl();
@@ -817,26 +818,31 @@ struct Memory::Impl {
     void HandleRasterizerDownload(VAddr v_address, size_t size) {
         const auto* p = GetPointerImpl(
             v_address, []() {}, []() {});
-        auto& gpu_device_memory = system.Host1x().MemoryManager();
-        DAddr address =
-            gpu_device_memory.GetAddressFromPAddr(system.DeviceMemory().GetRawPhysicalAddr(p));
+        if (!gpu_device_memory) [[unlikely]] {
+            gpu_device_memory = &system.Host1x().MemoryManager();
+        }
         const size_t core = system.GetCurrentHostThreadID();
         auto& current_area = rasterizer_read_areas[core];
-        const DAddr end_address = address + size;
-        if (current_area.start_address <= address && end_address <= current_area.end_address)
-            [[likely]] {
-            return;
-        }
-        current_area = system.GPU().OnCPURead(address, size);
+        gpu_device_memory->ApplyOpOnPointer(
+            p, scratch_buffers[core], [&](DAddr address) {
+                const DAddr end_address = address + size;
+                if (current_area.start_address <= address && end_address <= current_area.end_address)
+                    [[likely]] {
+                    return;
+                }
+                current_area = system.GPU().OnCPURead(address, size);
+            });
     }
 
     void HandleRasterizerWrite(VAddr v_address, size_t size) {
         const auto* p = GetPointerImpl(
             v_address, []() {}, []() {});
-        PAddr address = system.DeviceMemory().GetRawPhysicalAddr(p);
         constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1;
         const size_t core = std::min(system.GetCurrentHostThreadID(),
                                      sys_core); // any other calls threads go to syscore.
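+        // Host1x may not be initialized yet when Memory::Impl is constructed, so
+        // the device memory manager pointer is cached lazily on first use.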
+        if (!gpu_device_memory) [[unlikely]] {
+            gpu_device_memory = &system.Host1x().MemoryManager();
+        }
         // Guard on sys_core;
         if (core == sys_core) [[unlikely]] {
             sys_core_guard.lock();
         }
         SCOPE_EXIT({
             if (core == sys_core) [[unlikely]] {
                 sys_core_guard.unlock();
             }
         });
-        auto& current_area = rasterizer_write_areas[core];
-        PAddr subaddress = address >> YUZU_PAGEBITS;
-        bool do_collection = current_area.last_address == subaddress;
-        if (!do_collection) [[unlikely]] {
-            do_collection = system.GPU().OnCPUWrite(address, size);
-            if (!do_collection) {
-                return;
+        gpu_device_memory->ApplyOpOnPointer(
+            p, scratch_buffers[core], [&](DAddr address) {
+                auto& current_area = rasterizer_write_areas[core];
+                PAddr subaddress = address >> YUZU_PAGEBITS;
+                bool do_collection = current_area.last_address == subaddress;
+                if (!do_collection) [[unlikely]] {
+                    do_collection = system.GPU().OnCPUWrite(address, size);
+                    if (!do_collection) {
+                        return;
+                    }
+                    current_area.last_address = subaddress;
                 }
-                current_area.last_address = subaddress;
-            }
-        gpu_dirty_managers[core].Collect(address, size);
+                gpu_dirty_managers[core].Collect(address, size);
+            });
     }
 
     struct GPUDirtyState {
         PAddr last_address;
     };
@@ -872,10 +881,12 @@ struct Memory::Impl {
     }
 
     Core::System& system;
+    Tegra::MaxwellDeviceMemoryManager* gpu_device_memory{};
     Common::PageTable* current_page_table = nullptr;
     std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
         rasterizer_read_areas{};
     std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};
+    std::array<Common::ScratchBuffer<u32>, Core::Hardware::NUM_CPU_CORES> scratch_buffers{};
     std::span<Core::GPUDirtyMemoryManager> gpu_dirty_managers;
     std::mutex sys_core_guard;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ca31e2fbd6..71b748c743 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -554,9 +554,8 @@ void RasterizerOpenGL::InvalidateRegion(DAddr addr, u64 size, VideoCommon::Cache
     }
 }
 
-bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) {
+bool RasterizerOpenGL::OnCPUWrite(DAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-    const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
     if (addr == 0 || size == 0) {
         return false;
     }
@@ -577,9 +576,9 @@ bool RasterizerOpenGL::OnCPUWrite(PAddr p_addr, u64 size) {
     return false;
 }
 
-void RasterizerOpenGL::OnCacheInvalidation(PAddr p_addr, u64 size) {
+void RasterizerOpenGL::OnCacheInvalidation(DAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-    const DAddr addr = device_memory.GetAddressFromPAddr(p_addr);
+
     if (addr == 0 || size == 0) {
         return;
     }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index efcc349a0b..7db1319856 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -602,8 +602,7 @@ void RasterizerVulkan::InnerInvalidation(std::span