From c5600435817ff751d2a5550ded54a033ccf0c15b Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Thu, 6 Sep 2018 14:51:24 +0200 Subject: [PATCH 1/3] rasterizer: Drop unused handler. This virtual function is called in a very hot spot, and it does nothing. If this kind of feature is required, please be more specific and add callbacks in the switch statement within Maxwell3D::WriteReg. There is no point in having another switch statement within the rasterizer. --- src/video_core/engines/maxwell_3d.cpp | 2 -- src/video_core/rasterizer_interface.h | 3 --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 2 -- src/video_core/renderer_opengl/gl_rasterizer.h | 1 - 4 files changed, 8 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 1308080b5e..329079ddd7 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -135,8 +135,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { break; } - rasterizer.NotifyMaxwellRegisterChanged(method); - if (debug_context) { debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr); } diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 9d78e8b6b1..cd819d69fe 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -20,9 +20,6 @@ public: /// Clear the current framebuffer virtual void Clear() = 0; - /// Notify rasterizer that the specified Maxwell register has been changed - virtual void NotifyMaxwellRegisterChanged(u32 method) = 0; - /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c7e2c877c6..fdfca767ad 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -527,8 +527,6 @@ 
void RasterizerOpenGL::DrawArrays() { state.Apply(); } -void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {} - void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {} diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 3d62cc196b..eaf31ae966 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -45,7 +45,6 @@ public: void DrawArrays() override; void Clear() override; - void NotifyMaxwellRegisterChanged(u32 method) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; From 0cfb0bacb2581d79631f496afbc3a3d5dd19eb42 Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Thu, 6 Sep 2018 15:48:08 +0200 Subject: [PATCH 2/3] video_core: Move command buffer loop. This moves the hot loop into video_core. This refactoring shall reduce the CPU overhead of calling ProcessCommandList. 
--- .../hle/service/nvdrv/devices/nvhost_gpu.cpp | 26 +++-- .../hle/service/nvdrv/devices/nvhost_gpu.h | 17 --- src/video_core/command_processor.cpp | 101 ++++++++++-------- src/video_core/command_processor.h | 17 +++ src/video_core/gpu.h | 4 +- 5 files changed, 86 insertions(+), 79 deletions(-) diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 4cdf7f6135..8e0f9a9e57 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -8,6 +8,7 @@ #include "core/core.h" #include "core/hle/service/nvdrv/devices/nvhost_gpu.h" #include "core/memory.h" +#include "video_core/command_processor.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -134,17 +135,16 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector& input, std::vector& outp LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags); - ASSERT_MSG(input.size() == - sizeof(IoctlSubmitGpfifo) + params.num_entries * sizeof(IoctlGpfifoEntry), + ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) + + params.num_entries * sizeof(Tegra::CommandListHeader), "Incorrect input size"); - std::vector entries(params.num_entries); + std::vector entries(params.num_entries); std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], - params.num_entries * sizeof(IoctlGpfifoEntry)); - for (auto entry : entries) { - Tegra::GPUVAddr va_addr = entry.Address(); - Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz); - } + params.num_entries * sizeof(Tegra::CommandListHeader)); + + Core::System::GetInstance().GPU().ProcessCommandLists(entries); + params.fence_out.id = 0; params.fence_out.value = 0; std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo)); @@ -160,14 +160,12 @@ u32 nvhost_gpu::KickoffPB(const std::vector& input, std::vector& output) LOG_WARNING(Service_NVDRV, "(STUBBED) 
called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags); - std::vector entries(params.num_entries); + std::vector entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), - params.num_entries * sizeof(IoctlGpfifoEntry)); + params.num_entries * sizeof(Tegra::CommandListHeader)); + + Core::System::GetInstance().GPU().ProcessCommandLists(entries); - for (auto entry : entries) { - Tegra::GPUVAddr va_addr = entry.Address(); - Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz); - } params.fence_out.id = 0; params.fence_out.value = 0; std::memcpy(output.data(), &params, output.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h index 03b7356d07..baaefd79a9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "common/swap.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" -#include "video_core/memory_manager.h" namespace Service::Nvidia::Devices { @@ -151,22 +150,6 @@ private: }; static_assert(sizeof(IoctlAllocObjCtx) == 16, "IoctlAllocObjCtx is incorrect size"); - struct IoctlGpfifoEntry { - u32_le entry0; // gpu_va_lo - union { - u32_le entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) - BitField<0, 8, u32_le> gpu_va_hi; - BitField<8, 2, u32_le> unk1; - BitField<10, 21, u32_le> sz; - BitField<31, 1, u32_le> unk2; - }; - - Tegra::GPUVAddr Address() const { - return (static_cast(gpu_va_hi) << 32) | entry0; - } - }; - static_assert(sizeof(IoctlGpfifoEntry) == 8, "IoctlGpfifoEntry is incorrect size"); - struct IoctlSubmitGpfifo { u64_le address; // pointer to gpfifo entry structs u32_le num_entries; // number of fence objects being submitted diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index d5831e7521..e0c2771053 100644 --- 
a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -69,57 +69,64 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) } } -void GPU::ProcessCommandList(GPUVAddr address, u32 size) { - const boost::optional head_address = memory_manager->GpuToCpuAddress(address); - VAddr current_addr = *head_address; - while (current_addr < *head_address + size * sizeof(CommandHeader)) { - const CommandHeader header = {Memory::Read32(current_addr)}; - current_addr += sizeof(u32); +MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); - switch (header.mode.Value()) { - case SubmissionMode::IncreasingOld: - case SubmissionMode::Increasing: { - // Increase the method value with each argument. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; - } - case SubmissionMode::NonIncreasingOld: - case SubmissionMode::NonIncreasing: { - // Use the same method value for all arguments. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; - } - case SubmissionMode::IncreaseOnce: { - ASSERT(header.arg_count.Value() >= 1); - - // Use the original method for the first argument and then the next method for all other - // arguments. 
- WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - 1); +void GPU::ProcessCommandLists(const std::vector& commands) { + MICROPROFILE_SCOPE(ProcessCommandLists); + for (auto entry : commands) { + Tegra::GPUVAddr address = entry.Address(); + u32 size = entry.sz; + const boost::optional head_address = memory_manager->GpuToCpuAddress(address); + VAddr current_addr = *head_address; + while (current_addr < *head_address + size * sizeof(CommandHeader)) { + const CommandHeader header = {Memory::Read32(current_addr)}; current_addr += sizeof(u32); - for (unsigned i = 1; i < header.arg_count; ++i) { - WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); + switch (header.mode.Value()) { + case SubmissionMode::IncreasingOld: + case SubmissionMode::Increasing: { + // Increase the method value with each argument. + for (unsigned i = 0; i < header.arg_count; ++i) { + WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::NonIncreasingOld: + case SubmissionMode::NonIncreasing: { + // Use the same method value for all arguments. + for (unsigned i = 0; i < header.arg_count; ++i) { + WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::IncreaseOnce: { + ASSERT(header.arg_count.Value() >= 1); + + // Use the original method for the first argument and then the next method for all + // other arguments. 
+ WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), + header.arg_count - 1); + current_addr += sizeof(u32); + + for (unsigned i = 1; i < header.arg_count; ++i) { + WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::Inline: { + // The register value is stored in the bits 16-28 as an immediate + WriteReg(header.method, header.subchannel, header.inline_data, 0); + break; + } + default: + UNIMPLEMENTED(); } - break; - } - case SubmissionMode::Inline: { - // The register value is stored in the bits 16-28 as an immediate - WriteReg(header.method, header.subchannel, header.inline_data, 0); - break; - } - default: - UNIMPLEMENTED(); } } } diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h index a01153e0b3..bd766e77a3 100644 --- a/src/video_core/command_processor.h +++ b/src/video_core/command_processor.h @@ -7,6 +7,7 @@ #include #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/memory_manager.h" namespace Tegra { @@ -19,6 +20,22 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +struct CommandListHeader { + u32 entry0; // gpu_va_lo + union { + u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) + BitField<0, 8, u32> gpu_va_hi; + BitField<8, 2, u32> unk1; + BitField<10, 21, u32> sz; + BitField<31, 1, u32> unk2; + }; + + GPUVAddr Address() const { + return (static_cast(gpu_va_hi) << 32) | entry0; + } +}; +static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size"); + union CommandHeader { u32 hex; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index d29f31f52e..9163fbdc6a 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -6,6 +6,7 @@ #include #include +#include #include "common/common_types.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include 
"video_core/memory_manager.h" @@ -67,6 +68,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format); /// Returns the number of bytes per pixel of each depth format. u32 DepthFormatBytesPerPixel(DepthFormat format); +struct CommandListHeader; class DebugContext; /** @@ -115,7 +117,7 @@ public: ~GPU(); /// Processes a command list stored at the specified address in GPU memory. - void ProcessCommandList(GPUVAddr address, u32 size); + void ProcessCommandLists(const std::vector& commands); /// Returns a reference to the Maxwell3D GPU engine. Engines::Maxwell3D& Maxwell3D(); From c1b8cd90589141feb182da0d48c335bd624a4793 Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Thu, 6 Sep 2018 17:02:46 +0200 Subject: [PATCH 3/3] video_core: Refactor command_processor. Inline the WriteReg helper as it is called ~20k times per frame. --- src/video_core/command_processor.cpp | 83 ++++++++++++++-------------- src/video_core/gpu.h | 3 - 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index e0c2771053..2625ddfdca 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -28,51 +28,52 @@ enum class BufferMethods { CountBufferMethods = 0x40, }; -void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) { - LOG_TRACE(HW_GPU, - "Processing method {:08X} on subchannel {} value " - "{:08X} remaining params {}", - method, subchannel, value, remaining_params); - - ASSERT(subchannel < bound_engines.size()); - - if (method == static_cast(BufferMethods::BindObject)) { - // Bind the current subchannel to the desired engine id. - LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); - bound_engines[subchannel] = static_cast(value); - return; - } - - if (method < static_cast(BufferMethods::CountBufferMethods)) { - // TODO(Subv): Research and implement these methods. 
- LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); - return; - } - - const EngineID engine = bound_engines[subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->WriteReg(method, value); - break; - case EngineID::MAXWELL_B: - maxwell_3d->WriteReg(method, value, remaining_params); - break; - case EngineID::MAXWELL_COMPUTE_B: - maxwell_compute->WriteReg(method, value); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->WriteReg(method, value); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} - MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); void GPU::ProcessCommandLists(const std::vector& commands) { MICROPROFILE_SCOPE(ProcessCommandLists); + + auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) { + LOG_TRACE(HW_GPU, + "Processing method {:08X} on subchannel {} value " + "{:08X} remaining params {}", + method, subchannel, value, remaining_params); + + ASSERT(subchannel < bound_engines.size()); + + if (method == static_cast(BufferMethods::BindObject)) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); + bound_engines[subchannel] = static_cast(value); + return; + } + + if (method < static_cast(BufferMethods::CountBufferMethods)) { + // TODO(Subv): Research and implement these methods. 
+ LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); + return; + } + + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->WriteReg(method, value); + break; + case EngineID::MAXWELL_B: + maxwell_3d->WriteReg(method, value, remaining_params); + break; + case EngineID::MAXWELL_COMPUTE_B: + maxwell_compute->WriteReg(method, value); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->WriteReg(method, value); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } + }; + for (auto entry : commands) { Tegra::GPUVAddr address = entry.Address(); u32 size = entry.sz; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 9163fbdc6a..4f71f99d7c 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -132,9 +132,6 @@ public: const Tegra::MemoryManager& MemoryManager() const; private: - /// Writes a single register in the engine bound to the specified subchannel - void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params); - std::unique_ptr memory_manager; /// Mapping of command subchannels to their bound engine ids.