glsl: Squash constant buffers into a single SSBO when we hit the limit
Avoids compilation errors at the cost of shader build times and runtime performance when a game hits the limit of uniform buffers we can use.
This commit is contained in:
parent
e68ee43a1a
commit
ee21e4ecd3
7 changed files with 173 additions and 79 deletions
|
@ -6,6 +6,7 @@
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
|
||||||
|
|
||||||
constexpr u32 NumStages = 5;
|
constexpr u32 NumStages = 5;
|
||||||
|
|
||||||
constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
|
constexpr std::array LimitUBOs = {
|
||||||
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
|
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
|
||||||
GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
|
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
|
||||||
|
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
|
||||||
|
|
||||||
constexpr std::array LimitSSBOs = {
|
constexpr std::array LimitSSBOs = {
|
||||||
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
|
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
|
||||||
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
|
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
|
||||||
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
|
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
|
||||||
|
|
||||||
constexpr std::array LimitSamplers = {
|
constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
|
||||||
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
|
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
|
||||||
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
|
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
|
||||||
GL_MAX_TEXTURE_IMAGE_UNITS};
|
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
|
||||||
|
GL_MAX_TEXTURE_IMAGE_UNITS,
|
||||||
|
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
|
||||||
|
|
||||||
constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
|
constexpr std::array LimitImages = {
|
||||||
GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
|
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
|
||||||
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
|
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
|
||||||
GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
|
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T GetInteger(GLenum pname) {
|
T GetInteger(GLenum pname) {
|
||||||
|
@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
|
||||||
return std::exchange(base, base + amount);
|
return std::exchange(base, base + amount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
|
||||||
|
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
|
||||||
|
std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
|
||||||
|
[](GLenum pname) { return GetInteger<u32>(pname); });
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
|
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
|
||||||
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
|
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
|
||||||
|
|
||||||
|
@ -159,7 +170,8 @@ bool IsASTCSupported() {
|
||||||
|
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
Device::Device() : base_bindings{BuildBaseBindings()} {
|
Device::Device()
|
||||||
|
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
|
||||||
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
||||||
const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
|
const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
|
||||||
const std::vector extensions = GetExtensions();
|
const std::vector extensions = GetExtensions();
|
||||||
|
@ -194,7 +206,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
|
||||||
}
|
}
|
||||||
|
|
||||||
Device::Device(std::nullptr_t) {
|
Device::Device(std::nullptr_t) {
|
||||||
uniform_buffer_alignment = 0;
|
max_uniform_buffers.fill(std::numeric_limits<u32>::max());
|
||||||
|
uniform_buffer_alignment = 4;
|
||||||
|
shader_storage_alignment = 4;
|
||||||
max_vertex_attributes = 16;
|
max_vertex_attributes = 16;
|
||||||
max_varyings = 15;
|
max_varyings = 15;
|
||||||
has_warp_intrinsics = true;
|
has_warp_intrinsics = true;
|
||||||
|
@ -202,8 +216,6 @@ Device::Device(std::nullptr_t) {
|
||||||
has_vertex_viewport_layer = true;
|
has_vertex_viewport_layer = true;
|
||||||
has_image_load_formatted = true;
|
has_image_load_formatted = true;
|
||||||
has_variable_aoffi = true;
|
has_variable_aoffi = true;
|
||||||
has_component_indexing_bug = false;
|
|
||||||
has_precise_bug = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Device::TestVariableAoffi() {
|
bool Device::TestVariableAoffi() {
|
||||||
|
|
|
@ -24,6 +24,10 @@ public:
|
||||||
explicit Device();
|
explicit Device();
|
||||||
explicit Device(std::nullptr_t);
|
explicit Device(std::nullptr_t);
|
||||||
|
|
||||||
|
/// Returns the uniform buffer limit stored for the given shader type.
/// @param shader_type Shader type whose numeric value selects the table entry.
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
    const auto type_index = static_cast<std::size_t>(shader_type);
    return max_uniform_buffers[type_index];
}
|
||||||
|
|
||||||
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
|
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
|
||||||
return base_bindings[stage_index];
|
return base_bindings[stage_index];
|
||||||
}
|
}
|
||||||
|
@ -92,7 +96,8 @@ private:
|
||||||
static bool TestVariableAoffi();
|
static bool TestVariableAoffi();
|
||||||
static bool TestPreciseBug();
|
static bool TestPreciseBug();
|
||||||
|
|
||||||
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
|
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
|
||||||
|
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
|
||||||
std::size_t uniform_buffer_alignment{};
|
std::size_t uniform_buffer_alignment{};
|
||||||
std::size_t shader_storage_alignment{};
|
std::size_t shader_storage_alignment{};
|
||||||
u32 max_vertex_attributes{};
|
u32 max_vertex_attributes{};
|
||||||
|
|
|
@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// Number of constant buffer slots laid out per shader stage in the unified uniform buffer.
// NOTE(review): 18 presumably matches the guest GPU's per-stage constant buffer count - confirm.
constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
|
||||||
|
// Bytes reserved per stage: every slot is given the maximum constant buffer size, so a slot's
// offset inside its stage region is index * Maxwell::MaxConstBufferSize.
constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
|
||||||
|
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
|
||||||
|
// Size of the whole unified uniform buffer: one region per shader stage.
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
|
||||||
|
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
|
||||||
|
|
||||||
constexpr std::size_t NumSupportedVertexAttributes = 16;
|
constexpr std::size_t NumSupportedVertexAttributes = 16;
|
||||||
|
|
||||||
template <typename Engine, typename Entry>
|
template <typename Engine, typename Entry>
|
||||||
|
@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
|
||||||
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
|
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
|
||||||
CheckExtensions();
|
CheckExtensions();
|
||||||
|
|
||||||
|
unified_uniform_buffer.Create();
|
||||||
|
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
|
||||||
|
|
||||||
if (device.UseAssemblyShaders()) {
|
if (device.UseAssemblyShaders()) {
|
||||||
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
|
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
|
||||||
for (const GLuint cbuf : staging_cbufs) {
|
for (const GLuint cbuf : staging_cbufs) {
|
||||||
|
@ -842,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
|
||||||
MICROPROFILE_SCOPE(OpenGL_UBO);
|
MICROPROFILE_SCOPE(OpenGL_UBO);
|
||||||
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
|
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
|
||||||
const auto& shader_stage = stages[stage_index];
|
const auto& shader_stage = stages[stage_index];
|
||||||
|
const auto& entries = shader->GetEntries();
|
||||||
|
const bool use_unified = entries.use_unified_uniforms;
|
||||||
|
const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
|
||||||
|
|
||||||
u32 binding =
|
const auto base_bindings = device.GetBaseBindings(stage_index);
|
||||||
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
|
u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
|
||||||
for (const auto& entry : shader->GetEntries().const_buffers) {
|
for (const auto& entry : entries.const_buffers) {
|
||||||
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
|
const u32 index = entry.GetIndex();
|
||||||
SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
|
const auto& buffer = shader_stage.const_buffers[index];
|
||||||
|
SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
|
||||||
|
base_unified_offset + index * Maxwell::MaxConstBufferSize);
|
||||||
|
++binding;
|
||||||
|
}
|
||||||
|
if (use_unified) {
|
||||||
|
const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
|
||||||
|
entries.global_memory_entries.size());
|
||||||
|
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
|
||||||
|
base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
|
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
|
||||||
MICROPROFILE_SCOPE(OpenGL_UBO);
|
MICROPROFILE_SCOPE(OpenGL_UBO);
|
||||||
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
||||||
|
const auto& entries = kernel->GetEntries();
|
||||||
|
const bool use_unified = entries.use_unified_uniforms;
|
||||||
|
|
||||||
u32 binding = 0;
|
u32 binding = 0;
|
||||||
for (const auto& entry : kernel->GetEntries().const_buffers) {
|
for (const auto& entry : entries.const_buffers) {
|
||||||
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
|
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
|
||||||
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
|
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
|
||||||
Tegra::Engines::ConstBufferInfo buffer;
|
Tegra::Engines::ConstBufferInfo buffer;
|
||||||
buffer.address = config.Address();
|
buffer.address = config.Address();
|
||||||
buffer.size = config.size;
|
buffer.size = config.size;
|
||||||
buffer.enabled = mask[entry.GetIndex()];
|
buffer.enabled = mask[entry.GetIndex()];
|
||||||
SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
|
SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
|
||||||
|
use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
|
||||||
|
++binding;
|
||||||
|
}
|
||||||
|
if (use_unified) {
|
||||||
|
const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
|
||||||
|
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
|
||||||
|
NUM_CONST_BUFFERS_BYTES_PER_STAGE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
|
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
|
||||||
const Tegra::Engines::ConstBufferInfo& buffer,
|
const Tegra::Engines::ConstBufferInfo& buffer,
|
||||||
const ConstBufferEntry& entry) {
|
const ConstBufferEntry& entry, bool use_unified,
|
||||||
|
std::size_t unified_offset) {
|
||||||
if (!buffer.enabled) {
|
if (!buffer.enabled) {
|
||||||
// Set values to zero to unbind buffers
|
// Set values to zero to unbind buffers
|
||||||
if (device.UseAssemblyShaders()) {
|
if (device.UseAssemblyShaders()) {
|
||||||
|
@ -885,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
|
||||||
// UBO alignment requirements.
|
// UBO alignment requirements.
|
||||||
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
|
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
|
||||||
|
|
||||||
const auto alignment = device.GetUniformBufferAlignment();
|
const bool fast_upload = !use_unified && device.HasFastBufferSubData();
|
||||||
auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
|
|
||||||
device.HasFastBufferSubData());
|
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
|
||||||
if (!device.UseAssemblyShaders()) {
|
const GPUVAddr gpu_addr = buffer.address;
|
||||||
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
|
auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
|
||||||
|
|
||||||
|
if (device.UseAssemblyShaders()) {
|
||||||
|
UNIMPLEMENTED_IF(use_unified);
|
||||||
|
if (offset != 0) {
|
||||||
|
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
|
||||||
|
glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
|
||||||
|
cbuf = staging_cbuf;
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
|
glBindBufferRangeNV(stage, binding, cbuf, offset, size);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (offset != 0) {
|
|
||||||
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
|
if (use_unified) {
|
||||||
glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
|
glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
|
||||||
cbuf = staging_cbuf;
|
} else {
|
||||||
offset = 0;
|
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
|
||||||
}
|
}
|
||||||
glBindBufferRangeNV(stage, binding, cbuf, offset, size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
|
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
|
||||||
|
|
|
@ -107,7 +107,8 @@ private:
|
||||||
|
|
||||||
/// Configures a constant buffer.
|
/// Configures a constant buffer.
|
||||||
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
|
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
|
||||||
const ConstBufferEntry& entry);
|
const ConstBufferEntry& entry, bool use_unified,
|
||||||
|
std::size_t unified_offset);
|
||||||
|
|
||||||
/// Configures the current global memory entries to use for the draw command.
|
/// Configures the current global memory entries to use for the draw command.
|
||||||
void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
|
void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
|
||||||
|
@ -253,6 +254,7 @@ private:
|
||||||
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
|
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
|
||||||
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
|
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
|
||||||
std::size_t current_cbuf = 0;
|
std::size_t current_cbuf = 0;
|
||||||
|
OGLBuffer unified_uniform_buffer;
|
||||||
|
|
||||||
/// Number of commands queued to the OpenGL driver. Reseted on flush.
|
/// Number of commands queued to the OpenGL driver. Reseted on flush.
|
||||||
std::size_t num_queued_commands = 0;
|
std::size_t num_queued_commands = 0;
|
||||||
|
|
|
@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
|
||||||
entry.bindless_samplers = registry->GetBindlessSamplers();
|
entry.bindless_samplers = registry->GetBindlessSamplers();
|
||||||
params.disk_cache.SaveEntry(std::move(entry));
|
params.disk_cache.SaveEntry(std::move(entry));
|
||||||
|
|
||||||
return std::shared_ptr<CachedShader>(new CachedShader(
|
return std::shared_ptr<CachedShader>(
|
||||||
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
|
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
|
||||||
|
MakeEntries(params.device, ir, shader_type), std::move(program)));
|
||||||
}
|
}
|
||||||
|
|
||||||
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
|
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
|
||||||
|
@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
|
||||||
entry.bindless_samplers = registry->GetBindlessSamplers();
|
entry.bindless_samplers = registry->GetBindlessSamplers();
|
||||||
params.disk_cache.SaveEntry(std::move(entry));
|
params.disk_cache.SaveEntry(std::move(entry));
|
||||||
|
|
||||||
return std::shared_ptr<CachedShader>(new CachedShader(
|
return std::shared_ptr<CachedShader>(
|
||||||
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
|
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
|
||||||
|
MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
|
||||||
}
|
}
|
||||||
|
|
||||||
Shader CachedShader::CreateFromCache(const ShaderParameters& params,
|
Shader CachedShader::CreateFromCache(const ShaderParameters& params,
|
||||||
|
@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||||
PrecompiledShader shader;
|
PrecompiledShader shader;
|
||||||
shader.program = std::move(program);
|
shader.program = std::move(program);
|
||||||
shader.registry = std::move(registry);
|
shader.registry = std::move(registry);
|
||||||
shader.entries = MakeEntries(ir);
|
shader.entries = MakeEntries(device, ir, entry.type);
|
||||||
|
|
||||||
std::scoped_lock lock{mutex};
|
std::scoped_lock lock{mutex};
|
||||||
if (callback) {
|
if (callback) {
|
||||||
|
|
|
@ -61,8 +61,8 @@ struct TextureDerivates {};
|
||||||
using TextureArgument = std::pair<Type, Node>;
|
using TextureArgument = std::pair<Type, Node>;
|
||||||
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
|
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
|
||||||
|
|
||||||
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
|
constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
|
||||||
static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
|
constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
|
||||||
|
|
||||||
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
|
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
|
||||||
#define ftou floatBitsToUint
|
#define ftou floatBitsToUint
|
||||||
|
@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
|
||||||
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
|
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
|
||||||
|
const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
|
||||||
|
// We waste one UBO for emulation
|
||||||
|
const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
|
||||||
|
return num_ubos > num_available_ubos;
|
||||||
|
}
|
||||||
|
|
||||||
struct GenericVaryingDescription {
|
struct GenericVaryingDescription {
|
||||||
std::string name;
|
std::string name;
|
||||||
u8 first_element = 0;
|
u8 first_element = 0;
|
||||||
|
@ -412,8 +419,9 @@ class GLSLDecompiler final {
|
||||||
public:
|
public:
|
||||||
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
|
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
|
||||||
ShaderType stage, std::string_view identifier, std::string_view suffix)
|
ShaderType stage, std::string_view identifier, std::string_view suffix)
|
||||||
: device{device}, ir{ir}, registry{registry}, stage{stage},
|
: device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
|
||||||
identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
|
suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
|
||||||
|
UseUnifiedUniforms(device, ir, stage)} {
|
||||||
if (stage != ShaderType::Compute) {
|
if (stage != ShaderType::Compute) {
|
||||||
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
|
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
|
||||||
}
|
}
|
||||||
|
@ -834,12 +842,24 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeclareConstantBuffers() {
|
void DeclareConstantBuffers() {
|
||||||
|
if (use_unified_uniforms) {
|
||||||
|
const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
|
||||||
|
static_cast<u32>(ir.GetGlobalMemory().size());
|
||||||
|
code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
|
||||||
|
binding);
|
||||||
|
code.AddLine(" uint cbufs[];");
|
||||||
|
code.AddLine("}};");
|
||||||
|
code.AddNewLine();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
|
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
|
||||||
for (const auto& buffers : ir.GetConstantBuffers()) {
|
for (const auto [index, info] : ir.GetConstantBuffers()) {
|
||||||
const auto index = buffers.first;
|
const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
|
||||||
|
const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
|
||||||
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
|
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
|
||||||
GetConstBufferBlock(index));
|
GetConstBufferBlock(index));
|
||||||
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
|
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
|
||||||
code.AddLine("}};");
|
code.AddLine("}};");
|
||||||
code.AddNewLine();
|
code.AddNewLine();
|
||||||
}
|
}
|
||||||
|
@ -1038,42 +1058,51 @@ private:
|
||||||
|
|
||||||
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
|
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
|
||||||
const Node offset = cbuf->GetOffset();
|
const Node offset = cbuf->GetOffset();
|
||||||
|
const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
|
||||||
|
|
||||||
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
|
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
|
||||||
// Direct access
|
// Direct access
|
||||||
const u32 offset_imm = immediate->GetValue();
|
const u32 offset_imm = immediate->GetValue();
|
||||||
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
|
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
|
||||||
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
|
if (use_unified_uniforms) {
|
||||||
offset_imm / (4 * 4), (offset_imm / 4) % 4),
|
return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
|
||||||
|
Type::Uint};
|
||||||
|
} else {
|
||||||
|
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
|
||||||
|
offset_imm / (4 * 4), (offset_imm / 4) % 4),
|
||||||
|
Type::Uint};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indirect access
|
||||||
|
if (use_unified_uniforms) {
|
||||||
|
return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
|
||||||
|
Visit(offset).AsUint()),
|
||||||
Type::Uint};
|
Type::Uint};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (std::holds_alternative<OperationNode>(*offset)) {
|
const std::string final_offset = code.GenerateTemporary();
|
||||||
// Indirect access
|
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
|
||||||
const std::string final_offset = code.GenerateTemporary();
|
|
||||||
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
|
|
||||||
|
|
||||||
if (!device.HasComponentIndexingBug()) {
|
if (!device.HasComponentIndexingBug()) {
|
||||||
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
|
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
|
||||||
final_offset, final_offset),
|
final_offset, final_offset),
|
||||||
Type::Uint};
|
Type::Uint};
|
||||||
}
|
|
||||||
|
|
||||||
// AMD's proprietary GLSL compiler emits ill code for variable component access.
|
|
||||||
// To bypass this driver bug generate 4 ifs, one per each component.
|
|
||||||
const std::string pack = code.GenerateTemporary();
|
|
||||||
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
|
|
||||||
final_offset);
|
|
||||||
|
|
||||||
const std::string result = code.GenerateTemporary();
|
|
||||||
code.AddLine("uint {};", result);
|
|
||||||
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
|
|
||||||
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
|
|
||||||
pack, GetSwizzle(swizzle));
|
|
||||||
}
|
|
||||||
return {result, Type::Uint};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
UNREACHABLE_MSG("Unmanaged offset node type");
|
// AMD's proprietary GLSL compiler emits ill code for variable component access.
|
||||||
|
// To bypass this driver bug generate 4 ifs, one per each component.
|
||||||
|
const std::string pack = code.GenerateTemporary();
|
||||||
|
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
|
||||||
|
final_offset);
|
||||||
|
|
||||||
|
const std::string result = code.GenerateTemporary();
|
||||||
|
code.AddLine("uint {};", result);
|
||||||
|
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
|
||||||
|
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
|
||||||
|
GetSwizzle(swizzle));
|
||||||
|
}
|
||||||
|
return {result, Type::Uint};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
|
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
|
||||||
|
@ -2710,6 +2739,7 @@ private:
|
||||||
const std::string_view identifier;
|
const std::string_view identifier;
|
||||||
const std::string_view suffix;
|
const std::string_view suffix;
|
||||||
const Header header;
|
const Header header;
|
||||||
|
const bool use_unified_uniforms;
|
||||||
std::unordered_map<u8, VaryingTFB> transform_feedback;
|
std::unordered_map<u8, VaryingTFB> transform_feedback;
|
||||||
|
|
||||||
ShaderWriter code;
|
ShaderWriter code;
|
||||||
|
@ -2905,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() {
|
||||||
|
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
|
ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
|
||||||
ShaderEntries entries;
|
ShaderEntries entries;
|
||||||
for (const auto& cbuf : ir.GetConstantBuffers()) {
|
for (const auto& cbuf : ir.GetConstantBuffers()) {
|
||||||
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
|
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
|
||||||
|
@ -2926,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
|
||||||
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
|
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
|
||||||
}
|
}
|
||||||
entries.shader_length = ir.GetLength();
|
entries.shader_length = ir.GetLength();
|
||||||
|
entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
|
||||||
return entries;
|
return entries;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,11 +53,13 @@ struct ShaderEntries {
|
||||||
std::vector<GlobalMemoryEntry> global_memory_entries;
|
std::vector<GlobalMemoryEntry> global_memory_entries;
|
||||||
std::vector<SamplerEntry> samplers;
|
std::vector<SamplerEntry> samplers;
|
||||||
std::vector<ImageEntry> images;
|
std::vector<ImageEntry> images;
|
||||||
u32 clip_distances{};
|
|
||||||
std::size_t shader_length{};
|
std::size_t shader_length{};
|
||||||
|
u32 clip_distances{};
|
||||||
|
bool use_unified_uniforms{};
|
||||||
};
|
};
|
||||||
|
|
||||||
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
|
ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
|
||||||
|
Tegra::Engines::ShaderType stage);
|
||||||
|
|
||||||
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
|
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
|
||||||
const VideoCommon::Shader::Registry& registry,
|
const VideoCommon::Shader::Registry& registry,
|
||||||
|
|
Loading…
Reference in a new issue