From ac74b71d7530452126792c5fa0bf01fe7378ba00 Mon Sep 17 00:00:00 2001 From: bunnei Date: Tue, 27 Nov 2018 19:17:33 -0500 Subject: [PATCH] dma_pushbuffer: Optimize to avoid loop and copy on Push. --- src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 14 ++++++-------- src/video_core/dma_pusher.cpp | 12 ++++++++++-- src/video_core/dma_pusher.h | 10 +++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 39a58b6855..2e2b0ae1cf 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector< return 0; } -static void PushGPUEntries(const std::vector<Tegra::CommandListHeader>& entries) { +static void PushGPUEntries(Tegra::CommandList&& entries) { auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()}; - for (const auto& entry : entries) { - dma_pusher.Push(entry); - } + dma_pusher.Push(std::move(entries)); dma_pusher.DispatchCalls(); } @@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp params.num_entries * sizeof(Tegra::CommandListHeader), "Incorrect input size"); - std::vector<Tegra::CommandListHeader> entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; params.fence_out.value = 0; @@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags); - std::vector<Tegra::CommandListHeader> entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), 
params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; params.fence_out.value = 0; diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 23ec979447..63a958f11f 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() { // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + dma_pushbuffer_subindex = 0; + while (Core::System::GetInstance().IsPoweredOn()) { if (!Step()) { break; @@ -89,11 +91,17 @@ bool DmaPusher::Step() { } } else if (ib_enable && !dma_pushbuffer.empty()) { // Current pushbuffer empty, but we have more IB entries to read - const CommandListHeader& command_list_header{dma_pushbuffer.front()}; + const CommandList& command_list{dma_pushbuffer.front()}; + const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]}; dma_get = command_list_header.addr; dma_put = dma_get + command_list_header.size * sizeof(u32); non_main = command_list_header.is_non_main; - dma_pushbuffer.pop(); + + if (dma_pushbuffer_subindex >= command_list.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } } else { // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do return {}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 39d98e46e2..16e0697c4f 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -4,6 +4,7 @@ #pragma once +#include <vector> #include <queue> #include "common/bit_field.h" @@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect class GPU; +using CommandList = std::vector<CommandListHeader>; + /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the * emulated app fills with commands and 
tells PFIFO to process. The pushbuffers are then assembled @@ -57,8 +60,8 @@ public: explicit DmaPusher(GPU& gpu); ~DmaPusher(); - void Push(const CommandListHeader& command_list_header) { - dma_pushbuffer.push(command_list_header); + void Push(CommandList&& entries) { + dma_pushbuffer.push(std::move(entries)); } void DispatchCalls(); @@ -72,7 +75,8 @@ private: GPU& gpu; - std::queue<CommandListHeader> dma_pushbuffer; + std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed + std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer struct DmaState { u32 method; ///< Current method