diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 2442c3c294..e61d9af806 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SHADER_FILES
     opengl_fidelityfx_fsr.frag
     opengl_fidelityfx_fsr_easu.frag
     opengl_fidelityfx_fsr_rcas.frag
+    opengl_lmem_warmup.comp
     opengl_present.frag
     opengl_present.vert
     opengl_present_scaleforce.frag
diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp
new file mode 100644
index 0000000000..518268477a
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+// This shader is a workaround for a quirk in NVIDIA OpenGL drivers
+// Shaders using local memory see a great performance benefit if a shader that was dispatched
+// before it had more local memory allocated.
+// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that
+// subsequent shaders see the performance boost.
+
+// NOTE: This shader does no actual meaningful work and returns immediately,
+// it is simply a means to have the driver expect a shader using lots of local memory.
+
+#version 450
+
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(location = 0) uniform uint uniform_data;
+
+layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image;
+
+#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler
+#define NUM_LMEM_CONSTANTS 1
+#define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS
+
+uint lmem_0[ARRAY_SIZE];
+const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0));
+
+void main() {
+    const uint global_id = gl_GlobalInvocationID.x;
+    if (global_id <= 128) {
+        // Since the shader is called with a dispatch of 1x1x1
+        // This should always be the case, and this shader will not actually execute
+        return;
+    }
+    for (uint t = 0; t < uniform_data; t++) {
+        const uint offset = (t * uniform_data);
+        lmem_0[offset] = t;
+    }
+    const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x);
+    const uint value = lmem_0[offset];
+    const uint const_value = constant_values[offset / 4][offset % 4];
+    const uvec4 color = uvec4(value + const_value);
+
+    // A "side-effect" is needed so the variables don't get optimized out,
+    // but this should never execute so there should be no clobbering of previously bound state.
+    imageStore(dest_image, ivec3(gl_GlobalInvocationID), color);
+}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index fc711c44ae..d032885168 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -222,6 +222,7 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
     gpu.TickWork();
 
     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+    program_manager.LocalMemoryWarmup();
     pipeline->SetEngine(maxwell3d, gpu_memory);
     pipeline->Configure(is_indexed);
 
@@ -371,6 +372,7 @@ void RasterizerOpenGL::DispatchCompute() {
     if (!pipeline) {
         return;
     }
+    program_manager.LocalMemoryWarmup();
     pipeline->SetEngine(kepler_compute, gpu_memory);
     pipeline->Configure();
     const auto& qmd{kepler_compute->launch_description};
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 98841ae65e..2f6ba68239 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -3,7 +3,9 @@
 
 #include <glad/glad.h>
 
+#include "video_core/host_shaders/opengl_lmem_warmup_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
+#include "video_core/renderer_opengl/gl_shader_util.h"
 
 namespace OpenGL {
 
@@ -12,7 +14,8 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{
     GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
 };
 
-ProgramManager::ProgramManager(const Device& device) {
+ProgramManager::ProgramManager(const Device& device)
+    : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) {
     glCreateProgramPipelines(1, &pipeline.handle);
     if (device.UseAssemblyShaders()) {
         glEnable(GL_COMPUTE_PROGRAM_NV);
@@ -98,6 +101,11 @@ void ProgramManager::BindAssemblyPrograms(std::span
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
     std::array<GLuint, NUM_STAGES> current_programs{};
     GLuint current_assembly_compute_program = 0;
+    OGLProgram lmem_warmup_program;
 };
 
 } // namespace OpenGL
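
The two rasterizer hunks invoke program_manager.LocalMemoryWarmup() before every graphics and compute pipeline configuration, and ProgramManager now owns an lmem_warmup_program built from OPENGL_LMEM_WARMUP_COMP. The body of LocalMemoryWarmup() is not visible in this excerpt; below is a minimal sketch of what such a member could look like, assuming yuzu's OGLProgram wrapper (handle/Release()) and the existing ProgramManager::BindComputeProgram() helper. It is not the literal patch content.

void ProgramManager::LocalMemoryWarmup() {
    // Sketch only (assumed helper names): bind the dummy high-local-memory
    // compute program and dispatch a single 1x1x1 workgroup, which takes the
    // shader's early-return path and performs no actual work.
    if (lmem_warmup_program.handle != 0) {
        BindComputeProgram(lmem_warmup_program.handle);
        glDispatchCompute(1, 1, 1);
        // Drop the program so later calls become no-ops; the warmup is a
        // one-time cost paid before the first real draw or dispatch.
        lmem_warmup_program.Release();
    }
}

Releasing the program after the first dispatch keeps the per-draw calls in PrepareDraw() and DispatchCompute() essentially free once the driver has seen a shader with a large local-memory allocation.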