VideoCore: Implement DispatchIndirect

Fernando Sahmkow 2023-08-27 02:58:00 +02:00
parent 710ca3ca49
commit 115792158d
11 changed files with 119 additions and 13 deletions

src/video_core/dma_pusher.cpp

@@ -14,6 +14,7 @@
 namespace Tegra {
 
 constexpr u32 MacroRegistersStart = 0xE00;
+constexpr u32 ComputeInline = 0x6D;
 
 DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,
                      Control::ChannelState& channel_state_)
@@ -83,20 +84,35 @@ bool DmaPusher::Step() {
                     dma_state.dma_get, command_list_header.size * sizeof(u32));
             }
         }
 
-        if (Settings::IsGPULevelHigh() && dma_state.method < MacroRegistersStart) {
+        const auto safe_process = [&] {
             Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
                                          Core::Memory::GuestMemoryFlags::SafeRead>
                 headers(memory_manager, dma_state.dma_get, command_list_header.size,
                         &command_headers);
             ProcessCommands(headers);
+        };
+        const auto unsafe_process = [&] {
+            Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
+                                         Core::Memory::GuestMemoryFlags::UnsafeRead>
+                headers(memory_manager, dma_state.dma_get, command_list_header.size,
+                        &command_headers);
+            ProcessCommands(headers);
+        };
+        if (Settings::IsGPULevelHigh()) {
+            if (dma_state.method >= MacroRegistersStart) {
+                unsafe_process();
+                return true;
+            }
+            if (subchannel_type[dma_state.subchannel] == Engines::EngineTypes::KeplerCompute &&
+                dma_state.method == ComputeInline) {
+                unsafe_process();
+                return true;
+            }
+            safe_process();
             return true;
         }
-
-        Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
-                                     Core::Memory::GuestMemoryFlags::UnsafeRead>
-            headers(memory_manager, dma_state.dma_get, command_list_header.size, &command_headers);
-        ProcessCommands(headers);
+        unsafe_process();
     }
     return true;
 }
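
The routing rule this hunk introduces can be summarized as a small pure function. A minimal sketch, assuming standalone names (SelectReadMode and ReadMode are illustrative; the real code inlines this logic in DmaPusher::Step()):

#include <cstdint>
using u32 = std::uint32_t;

// Illustrative sketch of the read-path choice above, not part of the codebase.
enum class ReadMode { Safe, Unsafe };

ReadMode SelectReadMode(bool gpu_level_high, u32 method, bool on_kepler_compute_subchannel) {
    constexpr u32 MacroRegistersStart = 0xE00; // methods at/after this index feed macros
    constexpr u32 ComputeInline = 0x6D;        // KeplerCompute inline data upload method
    if (!gpu_level_high) {
        return ReadMode::Unsafe; // lower accuracy levels always take the fast path
    }
    if (method >= MacroRegistersStart) {
        return ReadMode::Unsafe; // macro arguments do not need a cache-flushing read
    }
    if (on_kepler_compute_subchannel && method == ComputeInline) {
        return ReadMode::Unsafe; // inline compute uploads are handled via dirty tracking instead
    }
    return ReadMode::Safe; // everything else gets the cache-coherent SafeRead
}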

src/video_core/dma_pusher.h

@@ -130,8 +130,10 @@ public:
 
     void DispatchCalls();
 
-    void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
+    void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id,
+                        Engines::EngineTypes engine_type) {
         subchannels[subchannel_id] = engine;
+        subchannel_type[subchannel_id] = engine_type;
     }
 
     void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
@@ -170,6 +172,7 @@ private:
     const bool ib_enable{true}; ///< IB mode enabled
 
     std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
+    std::array<Engines::EngineTypes, max_subchannels> subchannel_type;
 
     GPU& gpu;
     Core::System& system;

src/video_core/engines/engine_interface.h

@@ -11,6 +11,14 @@
 
 namespace Tegra::Engines {
 
+enum class EngineTypes : u32 {
+    KeplerCompute,
+    Maxwell3D,
+    Fermi2D,
+    MaxwellDMA,
+    KeplerMemory,
+};
+
 class EngineInterface {
 public:
     virtual ~EngineInterface() = default;

src/video_core/engines/engine_upload.h

@@ -69,6 +69,14 @@ public:
     /// Binds a rasterizer to this engine.
     void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
 
+    GPUVAddr ExecTargetAddress() const {
+        return regs.dest.Address();
+    }
+
+    u32 GetUploadSize() const {
+        return copy_size;
+    }
+
 private:
     void ProcessData(std::span<const u8> read_buffer);

src/video_core/engines/kepler_compute.cpp

@@ -43,16 +43,33 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
 
     switch (method) {
     case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
+        UploadInfo info{.upload_address = upload_address,
+                        .exec_address = upload_state.ExecTargetAddress(),
+                        .copy_size = upload_state.GetUploadSize()};
+        uploads.push_back(info);
         upload_state.ProcessExec(regs.exec_upload.linear != 0);
         break;
     }
     case KEPLER_COMPUTE_REG_INDEX(data_upload): {
+        upload_address = current_dma_segment;
         upload_state.ProcessData(method_argument, is_last_call);
         break;
     }
-    case KEPLER_COMPUTE_REG_INDEX(launch):
+    case KEPLER_COMPUTE_REG_INDEX(launch): {
+        const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
+        for (auto& data : uploads) {
+            const GPUVAddr offset = data.exec_address - launch_desc_loc;
+            if (offset / sizeof(u32) == LAUNCH_REG_INDEX(grid_dim_x) &&
+                memory_manager.IsMemoryDirty(data.upload_address, data.copy_size)) {
+                indirect_compute = {data.upload_address};
+            }
+        }
+        uploads.clear();
         ProcessLaunch();
+        indirect_compute = std::nullopt;
         break;
+    }
     default:
         break;
     }
@@ -62,6 +79,7 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                     u32 methods_pending) {
     switch (method) {
     case KEPLER_COMPUTE_REG_INDEX(data_upload):
+        upload_address = current_dma_segment;
        upload_state.ProcessData(base_start, amount);
        return;
    default:
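
The launch case hinges on simple address arithmetic: each recorded inline upload knows where its payload came from (upload_address) and where it was written (exec_address). If the write landed word-exactly on grid_dim_x inside the launch descriptor and the payload region was itself written by the GPU (dirty), the grid dimensions are GPU-generated, so the dispatch must be indirect. A hedged standalone sketch of that check, with hypothetical names (FindIndirectSource, is_memory_dirty) standing in for the engine state:

#include <cstdint>
#include <optional>
#include <vector>
using u32 = std::uint32_t;
using GPUVAddr = std::uint64_t;

struct UploadInfo {
    GPUVAddr upload_address; // guest address the payload was read from
    GPUVAddr exec_address;   // guest address the payload was written to
    u32 copy_size;
};

// Returns the GPU address holding the grid dimensions, if the launch is indirect.
std::optional<GPUVAddr> FindIndirectSource(const std::vector<UploadInfo>& uploads,
                                           GPUVAddr launch_desc_loc, u32 grid_dim_x_index,
                                           auto&& is_memory_dirty) {
    std::optional<GPUVAddr> result{};
    for (const auto& data : uploads) {
        const GPUVAddr offset = data.exec_address - launch_desc_loc;
        if (offset / sizeof(u32) == grid_dim_x_index &&
            is_memory_dirty(data.upload_address, data.copy_size)) {
            result = data.upload_address; // grid dims live in GPU-written memory
        }
    }
    return result;
}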

src/video_core/engines/kepler_compute.h

@@ -5,6 +5,7 @@
 
 #include <array>
 #include <cstddef>
+#include <optional>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
@@ -36,6 +37,9 @@ namespace Tegra::Engines {
 #define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
     (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
 
+#define LAUNCH_REG_INDEX(field_name)                                                               \
+    (offsetof(Tegra::Engines::KeplerCompute::LaunchParams, field_name) / sizeof(u32))
+
 class KeplerCompute final : public EngineInterface {
 public:
     explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
@@ -201,6 +205,10 @@ public:
     void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                          u32 methods_pending) override;
 
+    std::optional<GPUVAddr> GetIndirectComputeAddress() const {
+        return indirect_compute;
+    }
+
 private:
     void ProcessLaunch();
@@ -216,6 +224,15 @@ private:
     MemoryManager& memory_manager;
     VideoCore::RasterizerInterface* rasterizer = nullptr;
     Upload::State upload_state;
+    GPUVAddr upload_address;
+
+    struct UploadInfo {
+        GPUVAddr upload_address;
+        GPUVAddr exec_address;
+        u32 copy_size;
+    };
+    std::vector<UploadInfo> uploads;
+    std::optional<GPUVAddr> indirect_compute{};
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \

src/video_core/engines/puller.cpp

@@ -34,19 +34,24 @@ void Puller::ProcessBindMethod(const MethodCall& method_call) {
     bound_engines[method_call.subchannel] = engine_id;
     switch (engine_id) {
     case EngineID::FERMI_TWOD_A:
-        dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel);
+        dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel,
+                                  EngineTypes::Fermi2D);
         break;
     case EngineID::MAXWELL_B:
-        dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel);
+        dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel,
+                                  EngineTypes::Maxwell3D);
         break;
     case EngineID::KEPLER_COMPUTE_B:
-        dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel);
+        dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel,
+                                  EngineTypes::KeplerCompute);
         break;
     case EngineID::MAXWELL_DMA_COPY_A:
-        dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel);
+        dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel,
+                                  EngineTypes::MaxwellDMA);
         break;
     case EngineID::KEPLER_INLINE_TO_MEMORY_B:
-        dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel);
+        dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel,
+                                  EngineTypes::KeplerMemory);
         break;
     default:
         UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);

src/video_core/renderer_opengl/gl_rasterizer.cpp

@@ -380,6 +380,17 @@ void RasterizerOpenGL::DispatchCompute() {
     pipeline->SetEngine(kepler_compute, gpu_memory);
     pipeline->Configure();
     const auto& qmd{kepler_compute->launch_description};
+    auto indirect_address = kepler_compute->GetIndirectComputeAddress();
+    if (indirect_address) {
+        // DispatchIndirect
+        static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
+        const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite;
+        const auto [buffer, offset] =
+            buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op);
+        glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, buffer->Handle());
+        glDispatchComputeIndirect(static_cast<GLintptr>(offset));
+        return;
+    }
     glDispatchCompute(qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z);
     ++num_queued_commands;
     has_written_global_memory |= pipeline->WritesGlobalMemory();
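
The hard-coded 12 passed to ObtainBuffer is the size of the record glDispatchComputeIndirect consumes from GL_DISPATCH_INDIRECT_BUFFER: three tightly packed u32 workgroup counts, as specified by OpenGL 4.3 indirect compute dispatch. Equivalently:

#include <cstdint>

// Layout of the 12-byte record read from GL_DISPATCH_INDIRECT_BUFFER
// (mirrors the DispatchIndirectCommand layout in the OpenGL spec).
struct DispatchIndirectCommand {
    std::uint32_t num_groups_x;
    std::uint32_t num_groups_y;
    std::uint32_t num_groups_z;
};
static_assert(sizeof(DispatchIndirectCommand) == 12);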

src/video_core/renderer_vulkan/vk_rasterizer.cpp

@@ -463,6 +463,20 @@ void RasterizerVulkan::DispatchCompute() {
     pipeline->Configure(*kepler_compute, *gpu_memory, scheduler, buffer_cache, texture_cache);
 
     const auto& qmd{kepler_compute->launch_description};
+    auto indirect_address = kepler_compute->GetIndirectComputeAddress();
+    if (indirect_address) {
+        // DispatchIndirect
+        static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
+        const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite;
+        const auto [buffer, offset] =
+            buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op);
+        scheduler.RequestOutsideRenderPassOperationContext();
+        scheduler.Record([indirect_buffer = buffer->Handle(),
+                          indirect_offset = offset](vk::CommandBuffer cmdbuf) {
+            cmdbuf.DispatchIndirect(indirect_buffer, indirect_offset);
+        });
+        return;
+    }
     const std::array<u32, 3> dim{qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z};
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
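The Vulkan path consumes the same 12 bytes: vkCmdDispatchIndirect reads a VkDispatchIndirectCommand at indirect_buffer + indirect_offset, which is why only the buffer handle and offset need to be recorded (Vulkan requires the offset to be a multiple of 4 and the buffer to have been created with VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT). For reference, the structure as defined in <vulkan/vulkan_core.h>:

typedef struct VkDispatchIndirectCommand {
    uint32_t x; // workgroup count X
    uint32_t y; // workgroup count Y
    uint32_t z; // workgroup count Z
} VkDispatchIndirectCommand;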

src/video_core/vulkan_common/vulkan_wrapper.cpp

@@ -92,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkCmdCopyImage);
     X(vkCmdCopyImageToBuffer);
     X(vkCmdDispatch);
+    X(vkCmdDispatchIndirect);
     X(vkCmdDraw);
     X(vkCmdDrawIndexed);
     X(vkCmdDrawIndirect);

src/video_core/vulkan_common/vulkan_wrapper.h

@@ -203,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch {
     PFN_vkCmdCopyImage vkCmdCopyImage{};
     PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{};
     PFN_vkCmdDispatch vkCmdDispatch{};
+    PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{};
     PFN_vkCmdDraw vkCmdDraw{};
     PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
     PFN_vkCmdDrawIndirect vkCmdDrawIndirect{};
@@ -1209,6 +1210,10 @@ public:
         dld->vkCmdDispatch(handle, x, y, z);
     }
 
+    void DispatchIndirect(VkBuffer indirect_buffer, VkDeviceSize offset) const noexcept {
+        dld->vkCmdDispatchIndirect(handle, indirect_buffer, offset);
+    }
+
     void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
                          VkDependencyFlags dependency_flags, Span<VkMemoryBarrier> memory_barriers,
                          Span<VkBufferMemoryBarrier> buffer_barriers,