// Copyright 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <bitset>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <glad/glad.h>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/math_util.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
namespace OpenGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::SurfaceTarget;
using VideoCore::Surface::SurfaceType;
MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
namespace {
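// Resolves the full texture info for a sampler entry: bindless entries read their handle from a
// const buffer, while regular entries use the bound texture state, offset for indexed samplers.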
template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
Tegra::Engines::ShaderType shader_type,
std::size_t index = 0) {
if (entry.IsBindless()) {
const Tegra::Texture::TextureHandle tex_handle =
engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset());
return engine.GetTextureInfo(tex_handle);
}
const auto& gpu_profile = engine.AccessGuestDriverProfile();
const u32 offset =
entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
return engine.GetStageTexture(shader_type, offset);
} else {
return engine.GetTexture(offset);
}
}
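// Returns the size in bytes to upload for a const buffer, clamping indirect buffers that report
// a size larger than the hardware maximum.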
std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
const GLShader::ConstBufferEntry& entry) {
if (!entry.IsIndirect()) {
return entry.GetSize();
}
if (buffer.size > Maxwell::MaxConstBufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
Maxwell::MaxConstBufferSize);
return Maxwell::MaxConstBufferSize;
}
return buffer.size;
}
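// Helpers that translate a boolean into the matching glEnable/glDisable (or indexed) call.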
void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
void oglEnablei(GLenum cap, bool state, GLuint index) {
(state ? glEnablei : glDisablei)(cap, index);
}
} // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info)
: RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device},
shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
state.draw.shader_program = 0;
state.Apply();
CheckExtensions();
}
RasterizerOpenGL::~RasterizerOpenGL() = default;
void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
LOG_WARNING(
Render_OpenGL,
"Anisotropic filter is not supported! This can cause graphical issues in some games.");
}
}
void RasterizerOpenGL::SetupVertexFormat() {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
MICROPROFILE_SCOPE(OpenGL_VAO);
    // Use the vertex array as-is, assuming the data is formatted correctly for OpenGL. Always
    // enable the first 16 vertex attributes, as we don't know which ones are actually used until
    // shader time. Note that Tegra technically supports 32, but we cap this at 16 for now to
    // avoid OpenGL errors.
// TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
// assume every shader uses them all.
for (u32 index = 0; index < 16; ++index) {
const auto& attrib = regs.vertex_attrib_format[index];
// Ignore invalid attributes.
if (!attrib.IsValid()) {
glDisableVertexAttribArray(index);
continue;
}
glEnableVertexAttribArray(index);
if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
glVertexAttribIFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
attrib.offset);
} else {
glVertexAttribFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
}
glVertexAttribBinding(index, attrib.buffer);
}
}
void RasterizerOpenGL::SetupVertexBuffer() {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
MICROPROFILE_SCOPE(OpenGL_VB);
// Upload all guest vertex arrays sequentially to our buffer
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled()) {
continue;
}
const GPUVAddr start = vertex_array.StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
ASSERT(end > start);
const u64 size = end - start + 1;
const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
// Bind the vertex array to the buffer at the current offset.
vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset,
vertex_array.stride);
if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
glVertexBindingDivisor(index, vertex_array.divisor);
} else {
// Disable the vertex buffer instancing.
glVertexBindingDivisor(index, 0);
}
}
}
void RasterizerOpenGL::SetupVertexInstances() {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
    // Set the vertex binding divisors, enabling or disabling instancing per vertex array
for (u32 index = 0; index < 16; ++index) {
if (regs.instanced_arrays.IsInstancingEnabled(index) &&
regs.vertex_array[index].divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
glVertexBindingDivisor(index, regs.vertex_array[index].divisor);
} else {
// Disable the vertex buffer instancing.
glVertexBindingDivisor(index, 0);
}
}
}
GLintptr RasterizerOpenGL::SetupIndexBuffer() {
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = system.GPU().Maxwell3D().regs;
const std::size_t size = CalculateIndexBufferSize();
const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
vertex_array_pushbuffer.SetIndexBuffer(buffer);
return offset;
}
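// Sets up every active graphics stage for the next draw, accumulating which clip distances are
// written by any stage so that only those are enabled afterwards.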
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
MICROPROFILE_SCOPE(OpenGL_Shader);
auto& gpu = system.GPU().Maxwell3D();
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = gpu.regs.shader_config[index];
const auto program{static_cast<Maxwell::ShaderProgram>(index)};
// Skip stages that are not enabled
if (!gpu.regs.IsShaderConfigEnabled(index)) {
switch (program) {
case Maxwell::ShaderProgram::Geometry:
shader_program_manager->UseTrivialGeometryShader();
break;
case Maxwell::ShaderProgram::Fragment:
shader_program_manager->UseTrivialFragmentShader();
break;
default:
break;
}
continue;
}
        // Currently these stages are not supported in the OpenGL backend.
        // TODO(Blinkhawk): Port tessellation shaders from Vulkan to OpenGL
        if (program == Maxwell::ShaderProgram::TesselationControl ||
            program == Maxwell::ShaderProgram::TesselationEval) {
            continue;
        }
Shader shader{shader_cache.GetStageProgram(program)};
        // Stage indices are 0 - 5; VertexA and VertexB share stage 0
        const std::size_t stage = index == 0 ? 0 : index - 1;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(stage, shader);
SetupDrawImages(stage, shader);
const ProgramVariant variant(primitive_mode);
const auto program_handle = shader->GetHandle(variant);
switch (program) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
shader_program_manager->UseProgrammableVertexShader(program_handle);
break;
case Maxwell::ShaderProgram::Geometry:
shader_program_manager->UseProgrammableGeometryShader(program_handle);
break;
case Maxwell::ShaderProgram::Fragment:
shader_program_manager->UseProgrammableFragmentShader(program_handle);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index,
shader_config.enable.Value(), shader_config.offset);
}
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
        // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
        // clip distances only when they are written by a shader stage.
for (std::size_t i = 0; i < Maxwell::NumClipDistances; ++i) {
clip_distances[i] = clip_distances[i] || shader->GetShaderEntries().clip_distances[i];
}
// When VertexA is enabled, we have dual vertex shaders
if (program == Maxwell::ShaderProgram::VertexA) {
// VertexB was combined with VertexA, so we skip the VertexB iteration
++index;
}
}
SyncClipEnabled(clip_distances);
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
const auto& regs = system.GPU().Maxwell3D().regs;
std::size_t size = 0;
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
        if (!regs.vertex_array[index].IsEnabled()) {
            continue;
        }
const GPUVAddr start = regs.vertex_array[index].StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
ASSERT(end > start);
size += end - start + 1;
}
return size;
}
std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
const auto& regs = system.GPU().Maxwell3D().regs;
return static_cast<std::size_t>(regs.index_array.count) *
static_cast<std::size_t>(regs.index_array.FormatSizeInBytes());
}
void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) {
shader_cache.LoadDiskCache(stop_loading, callback);
}
void RasterizerOpenGL::ConfigureFramebuffers() {
MICROPROFILE_SCOPE(OpenGL_Framebuffer);
auto& gpu = system.GPU().Maxwell3D();
texture_cache.GuardRenderTargets(true);
View depth_surface = texture_cache.GetDepthBufferSurface(true);
const auto& regs = gpu.regs;
UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
// Bind the framebuffer surfaces
FramebufferCacheKey key;
const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
for (std::size_t index = 0; index < colors_count; ++index) {
View color_surface{texture_cache.GetColorBufferSurface(index, true)};
if (!color_surface) {
continue;
}
// Assume that a surface will be written to if it is used as a framebuffer, even
// if the shader doesn't actually write to it.
texture_cache.MarkColorBufferInUse(index);
key.SetAttachment(index, regs.rt_control.GetMap(index));
key.colors[index] = std::move(color_surface);
}
if (depth_surface) {
// Assume that a surface will be written to if it is used as a framebuffer, even if
// the shader doesn't actually write to it.
texture_cache.MarkDepthBufferInUse();
key.zeta = std::move(depth_surface);
}
texture_cache.GuardRenderTargets(false);
state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key);
}
void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
bool using_depth_fb, bool using_stencil_fb) {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
texture_cache.GuardRenderTargets(true);
View color_surface;
if (using_color_fb) {
color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false);
}
View depth_surface;
if (using_depth_fb || using_stencil_fb) {
depth_surface = texture_cache.GetDepthBufferSurface(false);
}
texture_cache.GuardRenderTargets(false);
FramebufferCacheKey key;
key.colors[0] = color_surface;
key.zeta = depth_surface;
current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key);
current_state.ApplyFramebufferState();
}
void RasterizerOpenGL::Clear() {
const auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.ShouldExecute()) {
return;
}
const auto& regs = maxwell3d.regs;
bool use_color{};
bool use_depth{};
bool use_stencil{};
OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({ prev_state.Apply(); });
OpenGLState clear_state{OpenGLState::GetCurState()};
if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
regs.clear_buffers.A) {
use_color = true;
}
if (use_color) {
// TODO: Signal state tracker about these changes
glColorMaski(0, regs.clear_buffers.R, regs.clear_buffers.G, regs.clear_buffers.B,
regs.clear_buffers.A);
SyncFramebufferSRGB();
// TODO(Rodrigo): Determine if clamping is used on clears
SyncFragmentColorClampState();
}
if (regs.clear_buffers.Z) {
ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!");
use_depth = true;
// TODO: Signal state tracker about these changes
glDepthMask(GL_TRUE);
}
if (regs.clear_buffers.S) {
ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
use_stencil = true;
clear_state.stencil.test_enabled = true;
if (regs.clear_flags.stencil) {
            // The stencil test affects the clear, so configure it with the masks in use
clear_state.stencil.front.test_func = GL_ALWAYS;
clear_state.stencil.front.test_mask = regs.stencil_front_func_mask;
clear_state.stencil.front.action_stencil_fail = GL_KEEP;
clear_state.stencil.front.action_depth_fail = GL_KEEP;
clear_state.stencil.front.action_depth_pass = GL_KEEP;
clear_state.stencil.front.write_mask = regs.stencil_front_mask;
if (regs.stencil_two_side_enable) {
clear_state.stencil.back.test_func = GL_ALWAYS;
clear_state.stencil.back.test_mask = regs.stencil_back_func_mask;
clear_state.stencil.back.action_stencil_fail = GL_KEEP;
clear_state.stencil.back.action_depth_fail = GL_KEEP;
clear_state.stencil.back.action_depth_pass = GL_KEEP;
clear_state.stencil.back.write_mask = regs.stencil_back_mask;
} else {
clear_state.stencil.back.test_func = GL_ALWAYS;
clear_state.stencil.back.test_mask = 0xFFFFFFFF;
clear_state.stencil.back.write_mask = 0xFFFFFFFF;
clear_state.stencil.back.action_stencil_fail = GL_KEEP;
clear_state.stencil.back.action_depth_fail = GL_KEEP;
clear_state.stencil.back.action_depth_pass = GL_KEEP;
}
}
}
if (!use_color && !use_depth && !use_stencil) {
        // Neither a color surface nor a depth/stencil surface is enabled
return;
}
ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil);
SyncRasterizeEnable(clear_state);
if (regs.clear_flags.scissor) {
SyncScissorTest();
}
UNIMPLEMENTED_IF(regs.clear_flags.viewport);
clear_state.Apply();
if (use_color) {
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
}
if (use_depth && use_stencil) {
glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
} else if (use_depth) {
glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
} else if (use_stencil) {
glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
}
++num_queued_commands;
}
void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
query_cache.UpdateCounters();
SyncViewport();
SyncRasterizeEnable(state);
SyncColorMask();
SyncFragmentColorClampState();
SyncMultiSampleState();
SyncDepthTestState();
SyncStencilTestState();
SyncBlendState();
SyncLogicOpState();
SyncCullMode();
SyncPrimitiveRestart();
SyncScissorTest();
SyncTransformFeedback();
SyncPointState();
SyncPolygonOffset();
SyncAlphaTest();
SyncFramebufferSRGB();
buffer_cache.Acquire();
std::size_t buffer_size = CalculateVertexArraysSize();
// Add space for index buffer
if (is_indexed) {
buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
}
// Uniform space for the 5 shader stages
buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers
buffer_size += Maxwell::MaxConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
buffer_cache.Map(buffer_size);
// Prepare vertex array format.
SetupVertexFormat();
vertex_array_pushbuffer.Setup();
// Upload vertex and index data.
SetupVertexBuffer();
SetupVertexInstances();
    GLintptr index_buffer_offset = 0;
if (is_indexed) {
index_buffer_offset = SetupIndexBuffer();
}
// Prepare packed bindings.
bind_ubo_pushbuffer.Setup();
bind_ssbo_pushbuffer.Setup();
// Setup emulation uniform buffer.
GLShader::MaxwellUniformData ubo;
ubo.SetFromRegs(gpu);
const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
bind_ubo_pushbuffer.Push(EmulationUniformBlockBinding, buffer, offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
SetupShaders(primitive_mode);
texture_cache.GuardSamplers(false);
ConfigureFramebuffers();
// Signal the buffer cache that we are not going to upload more things.
const bool invalidate = buffer_cache.Unmap();
// Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
vertex_array_pushbuffer.Bind();
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();
shader_program_manager->ApplyTo(state);
state.Apply();
if (texture_cache.TextureBarrier()) {
glTextureBarrier();
}
++num_queued_commands;
const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
const GLsizei num_instances =
static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
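    // Use the most specific GL draw call available for the current base vertex/base instance
    // combination.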
if (is_indexed) {
const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
glDrawElements(primitive_mode, num_vertices, format, offset);
} else if (num_instances == 1 && base_instance == 0) {
glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
} else if (base_vertex == 0 && base_instance == 0) {
glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances);
} else if (base_vertex == 0) {
glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
num_instances, base_instance);
} else if (base_instance == 0) {
glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
num_instances, base_vertex);
} else {
glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
offset, num_instances, base_vertex,
base_instance);
}
} else {
const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
if (num_instances == 1 && base_instance == 0) {
glDrawArrays(primitive_mode, base_vertex, num_vertices);
} else if (base_instance == 0) {
glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
} else {
glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
num_instances, base_instance);
}
}
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
if (device.HasBrokenCompute()) {
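        // Skip the dispatch entirely on devices with known-broken compute support.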
return;
}
buffer_cache.Acquire();
auto kernel = shader_cache.GetComputeKernel(code_addr);
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
launch_desc.block_dim_z, launch_desc.shared_alloc,
launch_desc.local_pos_alloc);
state.draw.shader_program = kernel->GetHandle(variant);
state.draw.program_pipeline = 0;
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);
bind_ubo_pushbuffer.Setup();
bind_ssbo_pushbuffer.Setup();
SetupComputeConstBuffers(kernel);
SetupComputeGlobalMemory(kernel);
buffer_cache.Unmap();
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();
state.ApplyTextures();
state.ApplyImages();
state.ApplyShaderProgram();
state.ApplyProgramPipeline();
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
query_cache.ResetCounter(type);
}
void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
std::optional<u64> timestamp) {
query_cache.Query(gpu_addr, type, timestamp);
}
void RasterizerOpenGL::FlushAll() {}
void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
if (!addr || !size) {
return;
}
texture_cache.FlushRegion(addr, size);
buffer_cache.FlushRegion(addr, size);
query_cache.FlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
if (!addr || !size) {
return;
}
texture_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
}
void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
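    // Only flush back to guest memory when accurate GPU emulation is enabled; otherwise
    // invalidating the caches is enough.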
if (Settings::values.use_accurate_gpu_emulation) {
FlushRegion(addr, size);
}
InvalidateRegion(addr, size);
}
void RasterizerOpenGL::FlushCommands() {
// Only flush when we have commands queued to OpenGL.
if (num_queued_commands == 0) {
return;
}
num_queued_commands = 0;
glFlush();
}
void RasterizerOpenGL::TickFrame() {
// Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
num_queued_commands = 0;
buffer_cache.TickFrame();
}
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
texture_cache.DoFermiCopy(src, dst, copy_config);
return true;
}
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
if (!framebuffer_addr) {
        return false;
}
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const auto surface{
texture_cache.TryFindFramebufferSurface(system.Memory().GetPointer(framebuffer_addr))};
if (!surface) {
        return false;
}
// Verify that the cached surface is the same size and format as the requested framebuffer
const auto& params{surface->GetSurfaceParams()};
const auto& pixel_format{
VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)};
ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
if (params.pixel_format != pixel_format) {
LOG_DEBUG(Render_OpenGL, "Framebuffer pixel_format is different");
}
screen_info.display_texture = surface->GetTexture();
screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion;
return true;
}
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[stage_index];
u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
for (const auto& entry : shader->GetShaderEntries().const_buffers) {
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry);
}
}
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
u32 binding = 0;
for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry);
}
}
void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const GLShader::ConstBufferEntry& entry) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
sizeof(float));
return;
}
// Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
// UBO alignment requirements.
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
const auto alignment = device.GetUniformBufferAlignment();
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
device.HasFastBufferSubData());
bind_ubo_pushbuffer.Push(binding, cbuf, offset, size);
}
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
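        // Each global memory descriptor is a (u64 address, u32 size) pair stored in the stage's
        // const buffer.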
const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(binding++, entry, gpu_addr, size);
}
}
void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
u32 binding = 0;
for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(binding++, entry, gpu_addr, size);
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size) {
const auto alignment{device.GetShaderStorageBufferAlignment()};
const auto [ssbo, buffer_offset] =
buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
bind_ssbo_pushbuffer.Push(binding, ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
}
void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
MICROPROFILE_SCOPE(OpenGL_Texture);
const auto& maxwell3d = system.GPU().Maxwell3D();
u32 binding = device.GetBaseBindings(stage_index).sampler;
for (const auto& entry : shader->GetShaderEntries().samplers) {
const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
if (!entry.IsIndexed()) {
const auto texture = GetTextureInfo(maxwell3d, entry, shader_type);
SetupTexture(binding++, texture, entry);
} else {
for (std::size_t i = 0; i < entry.Size(); ++i) {
const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
SetupTexture(binding++, texture, entry);
}
}
}
}
void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_Texture);
const auto& compute = system.GPU().KeplerCompute();
u32 binding = 0;
for (const auto& entry : kernel->GetShaderEntries().samplers) {
if (!entry.IsIndexed()) {
const auto texture =
GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute);
SetupTexture(binding++, texture, entry);
} else {
for (std::size_t i = 0; i < entry.Size(); ++i) {
const auto texture =
GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i);
SetupTexture(binding++, texture, entry);
}
}
}
}
void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const GLShader::SamplerEntry& entry) {
const auto view = texture_cache.GetTextureSurface(texture.tic, entry);
if (!view) {
// Can occur when texture addr is null or its memory is unmapped/invalid
state.samplers[binding] = 0;
state.textures[binding] = 0;
return;
}
state.textures[binding] = view->GetTexture();
if (view->GetSurfaceParams().IsBuffer()) {
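        // Buffer textures have no sampler state or swizzle to apply.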
return;
}
state.samplers[binding] = sampler_cache.GetSampler(texture.tsc);
// Apply swizzle to textures that are not buffers.
view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
texture.tic.w_source);
}
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
const auto& maxwell3d = system.GPU().Maxwell3D();
u32 binding = device.GetBaseBindings(stage_index).image;
for (const auto& entry : shader->GetShaderEntries().images) {
const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
SetupImage(binding++, tic, entry);
}
}
void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
const auto& compute = system.GPU().KeplerCompute();
u32 binding = 0;
for (const auto& entry : shader->GetShaderEntries().images) {
const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic;
SetupImage(binding++, tic, entry);
}
}
void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
const GLShader::ImageEntry& entry) {
const auto view = texture_cache.GetImageSurface(tic, entry);
if (!view) {
state.images[binding] = 0;
return;
}
if (!tic.IsBuffer()) {
view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
}
if (entry.IsWritten()) {
view->MarkAsModified(texture_cache.Tick());
}
state.images[binding] = view->GetTexture();
}
void RasterizerOpenGL::SyncViewport() {
const auto& regs = system.GPU().Maxwell3D().regs;
for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) {
const auto& src = regs.viewports[i];
const Common::Rectangle<f32> rect{regs.viewport_transform[i].GetRect()};
glViewportIndexedf(static_cast<GLuint>(i), rect.left, rect.bottom, rect.GetWidth(),
rect.GetHeight());
        glDepthRangeIndexed(static_cast<GLuint>(i), src.depth_range_near, src.depth_range_far);
}
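    // Y is flipped when exactly one of the viewport scale and screen_y_control negates it; if
    // both do, the negations cancel out.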
bool flip_y = false;
if (regs.viewport_transform[0].scale_y < 0.0) {
flip_y = !flip_y;
}
if (regs.screen_y_control.y_negate != 0) {
flip_y = !flip_y;
}
state.clip_control.origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
state.clip_control.depth_mode =
regs.depth_mode == Tegra::Engines::Maxwell3D::Regs::DepthMode::ZeroToOne
? GL_ZERO_TO_ONE
: GL_NEGATIVE_ONE_TO_ONE;
}
void RasterizerOpenGL::SyncDepthClamp() {
const auto& regs = system.GPU().Maxwell3D().regs;
const auto& state = regs.view_volume_clip_control;
    UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near,
                         "Unimplemented depth clamp separation!");
oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near);
}
void RasterizerOpenGL::SyncClipEnabled(
const std::array<bool, Maxwell::Regs::NumClipDistances>& clip_mask) {
const auto& regs = system.GPU().Maxwell3D().regs;
const std::array<bool, Maxwell::Regs::NumClipDistances> reg_state{
regs.clip_distance_enabled.c0 != 0, regs.clip_distance_enabled.c1 != 0,
regs.clip_distance_enabled.c2 != 0, regs.clip_distance_enabled.c3 != 0,
regs.clip_distance_enabled.c4 != 0, regs.clip_distance_enabled.c5 != 0,
regs.clip_distance_enabled.c6 != 0, regs.clip_distance_enabled.c7 != 0};
for (std::size_t i = 0; i < Maxwell::Regs::NumClipDistances; ++i) {
state.clip_distance[i] = reg_state[i] && clip_mask[i];
}
}
void RasterizerOpenGL::SyncClipCoef() {
UNIMPLEMENTED();
}
void RasterizerOpenGL::SyncCullMode() {
const auto& regs = system.GPU().Maxwell3D().regs;
oglEnable(GL_CULL_FACE, regs.cull.enabled);
glCullFace(MaxwellToGL::CullFace(regs.cull.cull_face));
glFrontFace(MaxwellToGL::FrontFace(regs.cull.front_face));
}
void RasterizerOpenGL::SyncPrimitiveRestart() {
const auto& regs = system.GPU().Maxwell3D().regs;
oglEnable(GL_PRIMITIVE_RESTART, regs.primitive_restart.enabled);
glPrimitiveRestartIndex(regs.primitive_restart.index);
}
void RasterizerOpenGL::SyncDepthTestState() {
const auto& regs = system.GPU().Maxwell3D().regs;
glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE);
oglEnable(GL_DEPTH_TEST, regs.depth_test_enable);
if (regs.depth_test_enable) {
glDepthFunc(MaxwellToGL::ComparisonOp(regs.depth_test_func));
}
}
void RasterizerOpenGL::SyncStencilTestState() {
auto& maxwell3d = system.GPU().Maxwell3D();
const auto& regs = maxwell3d.regs;
state.stencil.test_enabled = regs.stencil_enable != 0;
if (!regs.stencil_enable) {
return;
}
state.stencil.front.test_func = MaxwellToGL::ComparisonOp(regs.stencil_front_func_func);
state.stencil.front.test_ref = regs.stencil_front_func_ref;
state.stencil.front.test_mask = regs.stencil_front_func_mask;
state.stencil.front.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_fail);
state.stencil.front.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_zfail);
state.stencil.front.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_front_op_zpass);
state.stencil.front.write_mask = regs.stencil_front_mask;
if (regs.stencil_two_side_enable) {
state.stencil.back.test_func = MaxwellToGL::ComparisonOp(regs.stencil_back_func_func);
state.stencil.back.test_ref = regs.stencil_back_func_ref;
state.stencil.back.test_mask = regs.stencil_back_func_mask;
state.stencil.back.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_fail);
state.stencil.back.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_zfail);
state.stencil.back.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_back_op_zpass);
state.stencil.back.write_mask = regs.stencil_back_mask;
} else {
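        // Two-sided stencil is disabled, so reset the back face to pass-through defaults.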
state.stencil.back.test_func = GL_ALWAYS;
state.stencil.back.test_ref = 0;
state.stencil.back.test_mask = 0xFFFFFFFF;
state.stencil.back.write_mask = 0xFFFFFFFF;
state.stencil.back.action_stencil_fail = GL_KEEP;
state.stencil.back.action_depth_fail = GL_KEEP;
state.stencil.back.action_depth_pass = GL_KEEP;
}
}
void RasterizerOpenGL::SyncRasterizeEnable(OpenGLState& current_state) {
const auto& regs = system.GPU().Maxwell3D().regs;
current_state.rasterizer_discard = regs.rasterize_enable == 0;
}
void RasterizerOpenGL::SyncColorMask() {
auto& maxwell3d = system.GPU().Maxwell3D();
const auto& regs = maxwell3d.regs;
    if (regs.color_mask_common) {
        const auto& mask = regs.color_mask[0];
        glColorMask(mask.R, mask.G, mask.B, mask.A);
    } else {
        for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
            const auto& mask = regs.color_mask[i];
            glColorMaski(static_cast<GLuint>(i), mask.R, mask.G, mask.B, mask.A);
        }
    }
}
void RasterizerOpenGL::SyncMultiSampleState() {
const auto& regs = system.GPU().Maxwell3D().regs;
oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage);
oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one);
}
void RasterizerOpenGL::SyncFragmentColorClampState() {
const auto& regs = system.GPU().Maxwell3D().regs;
glClampColor(GL_CLAMP_FRAGMENT_COLOR, regs.frag_color_clamp ? GL_TRUE : GL_FALSE);
}
void RasterizerOpenGL::SyncBlendState() {
auto& maxwell3d = system.GPU().Maxwell3D();
const auto& regs = maxwell3d.regs;
glBlendColor(regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, regs.blend_color.a);
state.independant_blend.enabled = regs.independent_blend_enable;
if (!state.independant_blend.enabled) {
auto& blend = state.blend[0];
const auto& src = regs.blend;
blend.enabled = src.enable[0] != 0;
if (blend.enabled) {
blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb);
blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb);
blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb);
blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a);
blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
}
        for (std::size_t i = 1; i < Maxwell::NumRenderTargets; ++i) {
state.blend[i].enabled = false;
}
return;
}
    for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
auto& blend = state.blend[i];
const auto& src = regs.independent_blend[i];
blend.enabled = regs.blend.enable[i] != 0;
        if (!blend.enabled) {
            continue;
        }
blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb);
blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb);
blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb);
blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a);
blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
}
}
void RasterizerOpenGL::SyncLogicOpState() {
const auto& regs = system.GPU().Maxwell3D().regs;
oglEnable(GL_COLOR_LOGIC_OP, regs.logic_op.enable);
if (regs.logic_op.enable) {
glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation));
}
}
void RasterizerOpenGL::SyncScissorTest() {
const auto& regs = system.GPU().Maxwell3D().regs;
for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) {
const auto& src = regs.scissor_test[index];
oglEnablei(GL_SCISSOR_TEST, src.enable, static_cast<GLuint>(index));
glScissorIndexed(static_cast<GLuint>(index), src.min_x, src.min_y, src.max_x - src.min_x,
src.max_y - src.min_y);
}
}
void RasterizerOpenGL::SyncTransformFeedback() {
const auto& regs = system.GPU().Maxwell3D().regs;
    UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedback is not implemented");
}
void RasterizerOpenGL::SyncPointState() {
const auto& regs = system.GPU().Maxwell3D().regs;
    oglEnable(GL_PROGRAM_POINT_SIZE, regs.vp_point_size.enable);
    oglEnable(GL_POINT_SPRITE, regs.point_sprite_enable);
    // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's
    // invalid in OpenGL)
    glPointSize(std::max(1.0f, regs.point_size));
}
void RasterizerOpenGL::SyncPolygonOffset() {
auto& maxwell3d = system.GPU().Maxwell3D();
const auto& regs = maxwell3d.regs;
oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable);
oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable);
oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable);
// Hardware divides polygon offset units by two
glPolygonOffsetClamp(regs.polygon_offset_factor, regs.polygon_offset_units / 2.0f,
regs.polygon_offset_clamp);
}
void RasterizerOpenGL::SyncAlphaTest() {
const auto& regs = system.GPU().Maxwell3D().regs;
    UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1,
                         "Alpha testing is enabled with more than one render target");
oglEnable(GL_ALPHA_TEST, regs.alpha_test_enabled);
if (regs.alpha_test_enabled) {
glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref);
}
}
void RasterizerOpenGL::SyncFramebufferSRGB() {
const auto& regs = system.GPU().Maxwell3D().regs;
oglEnable(GL_FRAMEBUFFER_SRGB, regs.framebuffer_srgb);
}
} // namespace OpenGL