From abea6fa90c901d0b47487ed38d44511b18f0addf Mon Sep 17 00:00:00 2001 From: bunnei Date: Fri, 23 Nov 2018 23:20:56 -0500 Subject: [PATCH 1/3] gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). --- .../hle/service/nvdrv/devices/nvhost_gpu.cpp | 13 ++- src/video_core/CMakeLists.txt | 4 +- src/video_core/command_processor.h | 53 --------- src/video_core/dma_pusher.cpp | 110 ++++++++++++++++++ src/video_core/dma_pusher.h | 95 +++++++++++++++ src/video_core/engines/fermi_2d.cpp | 8 +- src/video_core/engines/fermi_2d.h | 2 +- src/video_core/engines/kepler_memory.cpp | 10 +- src/video_core/engines/kepler_memory.h | 3 +- src/video_core/engines/maxwell_3d.cpp | 53 +++++---- src/video_core/engines/maxwell_3d.h | 2 +- src/video_core/engines/maxwell_compute.cpp | 8 +- src/video_core/engines/maxwell_compute.h | 3 +- src/video_core/engines/maxwell_dma.cpp | 8 +- src/video_core/engines/maxwell_dma.h | 2 +- src/video_core/gpu.cpp | 58 +++++++++ src/video_core/gpu.h | 27 ++++- src/video_core/macro_interpreter.cpp | 2 +- 18 files changed, 353 insertions(+), 108 deletions(-) delete mode 100644 src/video_core/command_processor.h create mode 100644 src/video_core/dma_pusher.cpp create mode 100644 src/video_core/dma_pusher.h diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 874d5e1c3e..39a58b6855 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -8,7 +8,6 @@ #include "core/core.h" #include "core/hle/service/nvdrv/devices/nvhost_gpu.h" #include "core/memory.h" -#include "video_core/command_processor.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -129,6 +128,14 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector& input, std::vector< return 0; } +static void PushGPUEntries(const std::vector& entries) { + auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()}; + for (const auto& entry : entries) { + dma_pusher.Push(entry); + } + dma_pusher.DispatchCalls(); +} + u32 nvhost_gpu::SubmitGPFIFO(const std::vector& input, std::vector& output) { if (input.size() < sizeof(IoctlSubmitGpfifo)) { UNIMPLEMENTED(); @@ -146,7 +153,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector& input, std::vector& outp std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], params.num_entries * sizeof(Tegra::CommandListHeader)); - Core::System::GetInstance().GPU().ProcessCommandLists(entries); + PushGPUEntries(entries); params.fence_out.id = 0; params.fence_out.value = 0; @@ -167,7 +174,7 @@ u32 nvhost_gpu::KickoffPB(const std::vector& input, std::vector& output) Memory::ReadBlock(params.address, entries.data(), params.num_entries * sizeof(Tegra::CommandListHeader)); - Core::System::GetInstance().GPU().ProcessCommandLists(entries); + PushGPUEntries(entries); params.fence_out.id = 0; params.fence_out.value = 0; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 3f906a5179..0406fbcd9a 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,6 +1,6 @@ add_library(video_core STATIC - command_processor.cpp - command_processor.h + dma_pusher.cpp + dma_pusher.h debug_utils/debug_utils.cpp debug_utils/debug_utils.h engines/fermi_2d.cpp diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h deleted file mode 100644 index bd766e77a3..0000000000 --- a/src/video_core/command_processor.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include -#include "common/bit_field.h" -#include "common/common_types.h" -#include "video_core/memory_manager.h" - -namespace Tegra { - -enum class SubmissionMode : u32 { - IncreasingOld = 0, - Increasing = 1, - NonIncreasingOld = 2, - NonIncreasing = 3, - Inline = 4, - IncreaseOnce = 5 -}; - -struct CommandListHeader { - u32 entry0; // gpu_va_lo - union { - u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) - BitField<0, 8, u32> gpu_va_hi; - BitField<8, 2, u32> unk1; - BitField<10, 21, u32> sz; - BitField<31, 1, u32> unk2; - }; - - GPUVAddr Address() const { - return (static_cast(gpu_va_hi) << 32) | entry0; - } -}; -static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size"); - -union CommandHeader { - u32 hex; - - BitField<0, 13, u32> method; - BitField<13, 3, u32> subchannel; - - BitField<16, 13, u32> arg_count; - BitField<16, 13, u32> inline_data; - - BitField<29, 3, SubmissionMode> mode; -}; -static_assert(std::is_standard_layout_v, "CommandHeader is not standard layout"); -static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); - -} // namespace Tegra diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp new file mode 100644 index 0000000000..9f85a7acad --- /dev/null +++ b/src/video_core/dma_pusher.cpp @@ -0,0 +1,110 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "core/core.h" +#include "core/memory.h" +#include "video_core/dma_pusher.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" + +namespace Tegra { + +DmaPusher::DmaPusher(GPU& gpu) : gpu(gpu) {} + +DmaPusher::~DmaPusher() = default; + +void DmaPusher::DispatchCalls() { + // On entering GPU code, assume all memory may be touched by the ARM core. + gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + + while (Core::System::GetInstance().IsPoweredOn()) { + if (!Step()) { + break; + } + } +} + +bool DmaPusher::Step() { + if (dma_get != dma_put) { + // Push buffer non-empty, read a word + const CommandHeader command_header{ + Memory::Read32(*gpu.MemoryManager().GpuToCpuAddress(dma_get))}; + + dma_get += sizeof(u32); + + if (!non_main) { + dma_mget = dma_get; + } + + // now, see if we're in the middle of a command + if (dma_state.length_pending) { + // Second word of long non-inc methods command - method count + dma_state.length_pending = 0; + dma_state.method_count = command_header.method_count_; + } else if (dma_state.method_count) { + // Data word of methods command + CallMethod(command_header.argument); + + if (!dma_state.non_incrementing) { + dma_state.method++; + } + + if (dma_increment_once) { + dma_state.non_incrementing = true; + } + + dma_state.method_count--; + } else { + // No command active - this is the first word of a new one + switch (command_header.mode) { + case SubmissionMode::Increasing: + SetState(command_header); + dma_state.non_incrementing = false; + dma_increment_once = false; + break; + case SubmissionMode::NonIncreasing: + SetState(command_header); + dma_state.non_incrementing = true; + dma_increment_once = false; + break; + case SubmissionMode::Inline: + dma_state.method = command_header.method; + dma_state.subchannel = command_header.subchannel; + CallMethod(command_header.arg_count); + dma_state.non_incrementing = true; + dma_increment_once = false; + break; + case SubmissionMode::IncreaseOnce: + SetState(command_header); + dma_state.non_incrementing = false; + dma_increment_once = true; + break; + } + } + } else if (ib_enable && !dma_pushbuffer.empty()) { + // Current pushbuffer empty, but we have more IB entries to read + const CommandListHeader& command_list_header{dma_pushbuffer.front()}; + dma_get = command_list_header.addr; + dma_put = dma_get + command_list_header.size * sizeof(u32); + non_main = command_list_header.is_non_main; + dma_pushbuffer.pop(); + } else { + // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do + return {}; + } + + return true; +} + +void DmaPusher::SetState(const CommandHeader& command_header) { + dma_state.method = command_header.method; + dma_state.subchannel = command_header.subchannel; + dma_state.method_count = command_header.method_count; +} + +void DmaPusher::CallMethod(u32 argument) const { + gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); +} + +} // namespace Tegra diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h new file mode 100644 index 0000000000..39d98e46e2 --- /dev/null +++ b/src/video_core/dma_pusher.h @@ -0,0 +1,95 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/memory_manager.h" + +namespace Tegra { + +enum class SubmissionMode : u32 { + IncreasingOld = 0, + Increasing = 1, + NonIncreasingOld = 2, + NonIncreasing = 3, + Inline = 4, + IncreaseOnce = 5 +}; + +struct CommandListHeader { + union { + u64 raw; + BitField<0, 40, GPUVAddr> addr; + BitField<41, 1, u64> is_non_main; + BitField<42, 21, u64> size; + }; +}; +static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size"); + +union CommandHeader { + u32 argument; + BitField<0, 13, u32> method; + BitField<0, 24, u32> method_count_; + BitField<13, 3, u32> subchannel; + BitField<16, 13, u32> arg_count; + BitField<16, 13, u32> method_count; + BitField<29, 3, SubmissionMode> mode; +}; +static_assert(std::is_standard_layout_v, "CommandHeader is not standard layout"); +static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); + +class GPU; + +/** + * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the + * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled + * into a "command stream" consisting of 32-bit words that make up "commands". + * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for + * details on this implementation. + */ +class DmaPusher { +public: + explicit DmaPusher(GPU& gpu); + ~DmaPusher(); + + void Push(const CommandListHeader& command_list_header) { + dma_pushbuffer.push(command_list_header); + } + + void DispatchCalls(); + +private: + bool Step(); + + void SetState(const CommandHeader& command_header); + + void CallMethod(u32 argument) const; + + GPU& gpu; + + std::queue dma_pushbuffer; + + struct DmaState { + u32 method; ///< Current method + u32 subchannel; ///< Current subchannel + u32 method_count; ///< Current method count + u32 length_pending; ///< Large NI command length pending + bool non_incrementing; ///< Current command’s NI flag + }; + + DmaState dma_state{}; + bool dma_increment_once{}; + + GPUVAddr dma_put{}; ///< pushbuffer current end address + GPUVAddr dma_get{}; ///< pushbuffer current read address + GPUVAddr dma_mget{}; ///< main pushbuffer last read address + bool ib_enable{true}; ///< IB mode enabled + bool non_main{}; ///< non-main pushbuffer active +}; + +} // namespace Tegra diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 8d0700d131..dbea5bb5eb 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -14,13 +14,13 @@ namespace Tegra::Engines { Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : memory_manager(memory_manager), rasterizer{rasterizer} {} -void Fermi2D::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case FERMI2D_REG_INDEX(trigger): { HandleSurfaceCopy(); break; diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 2a6e8bbbb3..50009bf753 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -27,7 +27,7 @@ public: ~Fermi2D() = default; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr std::size_t NUM_REGS = 0x258; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 2adbc9eaf4..4880191fc6 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -17,19 +17,19 @@ KeplerMemory::KeplerMemory(VideoCore::RasterizerInterface& rasterizer, KeplerMemory::~KeplerMemory() = default; -void KeplerMemory::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid KeplerMemory register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { state.write_offset = 0; break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(value); + ProcessData(method_call.argument); break; } } diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index bf4a13cffd..fe9ebc5b9b 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -9,6 +9,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" namespace VideoCore { @@ -26,7 +27,7 @@ public: ~KeplerMemory(); /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr size_t NUM_REGS = 0x7F; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index f0a5470b9f..b19b3a75a5 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -97,71 +97,74 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector parameters) { macro_interpreter.Execute(search->second, std::move(parameters)); } -void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { +void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { auto debug_context = Core::System::GetInstance().GetGPUDebugContext(); // It is an error to write to a register other than the current macro's ARG register before it // has finished execution. if (executing_macro != 0) { - ASSERT(method == executing_macro + 1); + ASSERT(method_call.method == executing_macro + 1); } // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. - if (method >= MacroRegistersStart) { + if (method_call.method >= MacroRegistersStart) { // We're trying to execute a macro if (executing_macro == 0) { // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, + ASSERT_MSG((method_call.method % 2) == 0, "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; + executing_macro = method_call.method; } - macro_params.push_back(value); + macro_params.push_back(method_call.argument); // Call the macro when there are no more parameters in the command buffer - if (remaining_params == 0) { + if (method_call.IsLastCall()) { CallMacroMethod(executing_macro, std::move(macro_params)); } return; } - ASSERT_MSG(method < Regs::NUM_REGS, + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); if (debug_context) { debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandLoaded, nullptr); } - if (regs.reg_array[method] != value) { - regs.reg_array[method] = value; + if (regs.reg_array[method_call.method] != method_call.argument) { + regs.reg_array[method_call.method] = method_call.argument; // Vertex format - if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && - method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { + if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && + method_call.method < + MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { dirty_flags.vertex_attrib_format = true; } // Vertex buffer - if (method >= MAXWELL3D_REG_INDEX(vertex_array) && - method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) { - dirty_flags.vertex_array |= 1u << ((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); - } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && - method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) { + if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array) && + method_call.method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) { dirty_flags.vertex_array |= - 1u << ((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); - } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) && - method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) { - dirty_flags.vertex_array |= 1u << (method - MAXWELL3D_REG_INDEX(instanced_arrays)); + 1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); + } else if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && + method_call.method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) { + dirty_flags.vertex_array |= + 1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); + } else if (method_call.method >= MAXWELL3D_REG_INDEX(instanced_arrays) && + method_call.method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) { + dirty_flags.vertex_array |= + 1u << (method_call.method - MAXWELL3D_REG_INDEX(instanced_arrays)); } } - switch (method) { + switch (method_call.method) { case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(value); + ProcessMacroUpload(method_call.argument); break; } case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(value); + ProcessMacroBind(method_call.argument); break; } case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): @@ -180,7 +183,7 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - ProcessCBData(value); + ProcessCBData(method_call.argument); break; } case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 9324d9710a..84471f1812 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1080,7 +1080,7 @@ public: u32 GetRegisterValue(u32 method) const; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value, u32 remaining_params); + void CallMethod(const GPU::MethodCall& method_call); /// Returns a list of enabled textures for the specified shader stage. std::vector GetStageTextures(Regs::ShaderStage stage) const; diff --git a/src/video_core/engines/maxwell_compute.cpp b/src/video_core/engines/maxwell_compute.cpp index 8b5f083519..656db6a61b 100644 --- a/src/video_core/engines/maxwell_compute.cpp +++ b/src/video_core/engines/maxwell_compute.cpp @@ -8,13 +8,13 @@ namespace Tegra::Engines { -void MaxwellCompute::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void MaxwellCompute::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid MaxwellCompute register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case MAXWELL_COMPUTE_REG_INDEX(compute): { LOG_CRITICAL(HW_GPU, "Compute shaders are not implemented"); UNREACHABLE(); diff --git a/src/video_core/engines/maxwell_compute.h b/src/video_core/engines/maxwell_compute.h index 6ea934fb9b..1d71f11bdb 100644 --- a/src/video_core/engines/maxwell_compute.h +++ b/src/video_core/engines/maxwell_compute.h @@ -9,6 +9,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/gpu.h" namespace Tegra::Engines { @@ -42,7 +43,7 @@ public: "MaxwellCompute Regs has wrong size"); /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index a34e884fe5..06462f570d 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,16 +14,16 @@ namespace Tegra::Engines { MaxwellDMA::MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : memory_manager(memory_manager), rasterizer{rasterizer} {} -void MaxwellDMA::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid MaxwellDMA register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; #define MAXWELLDMA_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32)) - switch (method) { + switch (method_call.method) { case MAXWELLDMA_REG_INDEX(exec): { HandleCopy(); break; diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 5f3704f057..1f8cd65d24 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -24,7 +24,7 @@ public: ~MaxwellDMA() = default; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr std::size_t NUM_REGS = 0x1D6; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 51b3904f65..4a96530ae3 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include "common/assert.h" +#include "common/microprofile.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" @@ -26,6 +27,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { GPU::GPU(VideoCore::RasterizerInterface& rasterizer) { memory_manager = std::make_unique(); + dma_pusher = std::make_unique(*this); maxwell_3d = std::make_unique(rasterizer, *memory_manager); fermi_2d = std::make_unique(rasterizer, *memory_manager); maxwell_compute = std::make_unique(); @@ -51,6 +53,14 @@ const MemoryManager& GPU::MemoryManager() const { return *memory_manager; } +DmaPusher& GPU::DmaPusher() { + return *dma_pusher; +} + +const DmaPusher& GPU::DmaPusher() const { + return *dma_pusher; +} + u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { ASSERT(format != RenderTargetFormat::NONE); @@ -113,4 +123,52 @@ u32 DepthFormatBytesPerPixel(DepthFormat format) { } } +enum class BufferMethods { + BindObject = 0, + CountBufferMethods = 0x40, +}; + +MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); + +void GPU::CallMethod(const MethodCall& method_call) { + MICROPROFILE_SCOPE(ProcessCommandLists); + + LOG_TRACE(HW_GPU, + "Processing method {:08X} on subchannel {} value " + "{:08X} remaining params {}", + MethCall.method, MethCall.subchannel, value, remaining_params); + + ASSERT(method_call.subchannel < bound_engines.size()); + + if (method_call.method == static_cast(BufferMethods::BindObject)) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, + method_call.argument); + bound_engines[method_call.subchannel] = static_cast(method_call.argument); + return; + } + + const EngineID engine = bound_engines[method_call.subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMethod(method_call); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call); + break; + case EngineID::MAXWELL_COMPUTE_B: + maxwell_compute->CallMethod(method_call); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } +} + } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 5cc1e19ca0..af5ccd1e98 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -9,6 +9,7 @@ #include #include "common/common_types.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/dma_pusher.h" #include "video_core/memory_manager.h" namespace VideoCore { @@ -119,8 +120,23 @@ public: explicit GPU(VideoCore::RasterizerInterface& rasterizer); ~GPU(); - /// Processes a command list stored at the specified address in GPU memory. - void ProcessCommandLists(const std::vector& commands); + struct MethodCall { + u32 method{}; + u32 argument{}; + u32 subchannel{}; + u32 method_count{}; + + bool IsLastCall() const { + return method_count <= 1; + } + + MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) + : method(method), argument(argument), subchannel(subchannel), + method_count(method_count) {} + }; + + /// Calls a GPU method. + void CallMethod(const MethodCall& method_call); /// Returns a reference to the Maxwell3D GPU engine. Engines::Maxwell3D& Maxwell3D(); @@ -134,7 +150,14 @@ public: /// Returns a const reference to the GPU memory manager. const Tegra::MemoryManager& MemoryManager() const; + /// Returns a reference to the GPU DMA pusher. + Tegra::DmaPusher& DmaPusher(); + + /// Returns a const reference to the GPU DMA pusher. + const Tegra::DmaPusher& DmaPusher() const; + private: + std::unique_ptr dma_pusher; std::unique_ptr memory_manager; /// Mapping of command subchannels to their bound engine ids. diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index 2b0dea5cd2..9c55e9f1e1 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -250,7 +250,7 @@ void MacroInterpreter::SetMethodAddress(u32 address) { } void MacroInterpreter::Send(u32 value) { - maxwell3d.WriteReg(method_address.address, value, 0); + maxwell3d.CallMethod({method_address.address, value}); // Increment the method address by the method increment. method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); From c568f5cea79fe2e9d93fb545aee9bf6b3f6d70ae Mon Sep 17 00:00:00 2001 From: bunnei Date: Tue, 27 Nov 2018 18:42:21 -0500 Subject: [PATCH 2/3] gpu: Move command list profiling to DmaPusher::DispatchCalls. --- src/video_core/dma_pusher.cpp | 5 +++++ src/video_core/gpu.cpp | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 9f85a7acad..23ec979447 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" #include "video_core/dma_pusher.h" @@ -14,7 +15,11 @@ DmaPusher::DmaPusher(GPU& gpu) : gpu(gpu) {} DmaPusher::~DmaPusher() = default; +MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); + void DmaPusher::DispatchCalls() { + MICROPROFILE_SCOPE(DispatchCalls); + // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 4a96530ae3..6c81dee649 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -3,7 +3,6 @@ // Refer to the license.txt file included. #include "common/assert.h" -#include "common/microprofile.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" @@ -128,11 +127,7 @@ enum class BufferMethods { CountBufferMethods = 0x40, }; -MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); - void GPU::CallMethod(const MethodCall& method_call) { - MICROPROFILE_SCOPE(ProcessCommandLists); - LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {} value " "{:08X} remaining params {}", From ac74b71d7530452126792c5fa0bf01fe7378ba00 Mon Sep 17 00:00:00 2001 From: bunnei Date: Tue, 27 Nov 2018 19:17:33 -0500 Subject: [PATCH 3/3] dma_pushbuffer: Optimize to avoid loop and copy on Push. --- src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 14 ++++++-------- src/video_core/dma_pusher.cpp | 12 ++++++++++-- src/video_core/dma_pusher.h | 10 +++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 39a58b6855..2e2b0ae1cf 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector& input, std::vector< return 0; } -static void PushGPUEntries(const std::vector& entries) { +static void PushGPUEntries(Tegra::CommandList&& entries) { auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()}; - for (const auto& entry : entries) { - dma_pusher.Push(entry); - } + dma_pusher.Push(std::move(entries)); dma_pusher.DispatchCalls(); } @@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector& input, std::vector& outp params.num_entries * sizeof(Tegra::CommandListHeader), "Incorrect input size"); - std::vector entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; params.fence_out.value = 0; @@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector& input, std::vector& output) LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags); - std::vector entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; params.fence_out.value = 0; diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 23ec979447..63a958f11f 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() { // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + dma_pushbuffer_subindex = 0; + while (Core::System::GetInstance().IsPoweredOn()) { if (!Step()) { break; @@ -89,11 +91,17 @@ bool DmaPusher::Step() { } } else if (ib_enable && !dma_pushbuffer.empty()) { // Current pushbuffer empty, but we have more IB entries to read - const CommandListHeader& command_list_header{dma_pushbuffer.front()}; + const CommandList& command_list{dma_pushbuffer.front()}; + const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]}; dma_get = command_list_header.addr; dma_put = dma_get + command_list_header.size * sizeof(u32); non_main = command_list_header.is_non_main; - dma_pushbuffer.pop(); + + if (dma_pushbuffer_subindex >= command_list.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } } else { // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do return {}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 39d98e46e2..16e0697c4f 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include "common/bit_field.h" @@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect class GPU; +using CommandList = std::vector; + /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled @@ -57,8 +60,8 @@ public: explicit DmaPusher(GPU& gpu); ~DmaPusher(); - void Push(const CommandListHeader& command_list_header) { - dma_pushbuffer.push(command_list_header); + void Push(CommandList&& entries) { + dma_pushbuffer.push(std::move(entries)); } void DispatchCalls(); @@ -72,7 +75,8 @@ private: GPU& gpu; - std::queue dma_pushbuffer; + std::queue dma_pushbuffer; ///< Queue of command lists to be processed + std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer struct DmaState { u32 method; ///< Current method