forked from suyu/suyu
gl_shader_cache: Specialize shader workgroup
Drop the usage of ARB_compute_variable_group_size and specialize compute shaders instead. This permits compute to run on AMD and Intel proprietary drivers.
This commit is contained in:
parent
dc9961f341
commit
4f5d8e4342
6 changed files with 75 additions and 69 deletions
|
@ -140,7 +140,7 @@ public:
|
||||||
|
|
||||||
INSERT_PADDING_WORDS(0x3);
|
INSERT_PADDING_WORDS(0x3);
|
||||||
|
|
||||||
BitField<0, 16, u32> shared_alloc;
|
BitField<0, 18, u32> shared_alloc;
|
||||||
|
|
||||||
BitField<16, 16, u32> block_dim_x;
|
BitField<16, 16, u32> block_dim_x;
|
||||||
union {
|
union {
|
||||||
|
|
|
@ -273,8 +273,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
|
||||||
SetupDrawGlobalMemory(stage, shader);
|
SetupDrawGlobalMemory(stage, shader);
|
||||||
SetupDrawTextures(stage, shader, base_bindings);
|
SetupDrawTextures(stage, shader, base_bindings);
|
||||||
|
|
||||||
const ProgramVariant variant{base_bindings, primitive_mode};
|
const ProgramVariant variant(base_bindings, primitive_mode);
|
||||||
const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant);
|
const auto [program_handle, next_bindings] = shader->GetHandle(variant);
|
||||||
|
|
||||||
switch (program) {
|
switch (program) {
|
||||||
case Maxwell::ShaderProgram::VertexA:
|
case Maxwell::ShaderProgram::VertexA:
|
||||||
|
@ -725,18 +725,14 @@ bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||||
if (!GLAD_GL_ARB_compute_variable_group_size) {
|
|
||||||
LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
|
|
||||||
"lack of GL_ARB_compute_variable_group_size");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto kernel = shader_cache.GetComputeKernel(code_addr);
|
auto kernel = shader_cache.GetComputeKernel(code_addr);
|
||||||
SetupComputeTextures(kernel);
|
SetupComputeTextures(kernel);
|
||||||
SetupComputeImages(kernel);
|
SetupComputeImages(kernel);
|
||||||
|
|
||||||
const auto [program, next_bindings] = kernel->GetProgramHandle({});
|
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
||||||
state.draw.shader_program = program;
|
const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
|
||||||
|
launch_desc.block_dim_z);
|
||||||
|
std::tie(state.draw.shader_program, std::ignore) = kernel->GetHandle(variant);
|
||||||
state.draw.program_pipeline = 0;
|
state.draw.program_pipeline = 0;
|
||||||
|
|
||||||
const std::size_t buffer_size =
|
const std::size_t buffer_size =
|
||||||
|
@ -760,10 +756,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||||
state.ApplyShaderProgram();
|
state.ApplyShaderProgram();
|
||||||
state.ApplyProgramPipeline();
|
state.ApplyProgramPipeline();
|
||||||
|
|
||||||
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
|
||||||
glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y,
|
|
||||||
launch_desc.grid_dim_z, launch_desc.block_dim_x,
|
|
||||||
launch_desc.block_dim_y, launch_desc.block_dim_z);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::FlushAll() {}
|
void RasterizerOpenGL::FlushAll() {}
|
||||||
|
|
|
@ -255,7 +255,7 @@ void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
|
||||||
|
|
||||||
CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type,
|
CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type,
|
||||||
const ProgramCode& program_code, const ProgramCode& program_code_b,
|
const ProgramCode& program_code, const ProgramCode& program_code_b,
|
||||||
const ProgramVariant& variant, ConstBufferLocker& locker,
|
ConstBufferLocker& locker, const ProgramVariant& variant,
|
||||||
bool hint_retrievable = false) {
|
bool hint_retrievable = false) {
|
||||||
LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type));
|
LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type));
|
||||||
|
|
||||||
|
@ -268,17 +268,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
|
||||||
}
|
}
|
||||||
const auto entries = GLShader::GetEntries(ir);
|
const auto entries = GLShader::GetEntries(ir);
|
||||||
|
|
||||||
auto base_bindings{variant.base_bindings};
|
|
||||||
const auto primitive_mode{variant.primitive_mode};
|
|
||||||
|
|
||||||
std::string source = fmt::format(R"(// {}
|
std::string source = fmt::format(R"(// {}
|
||||||
#version 430 core
|
#version 430 core
|
||||||
#extension GL_ARB_separate_shader_objects : enable
|
#extension GL_ARB_separate_shader_objects : enable
|
||||||
)",
|
)",
|
||||||
GetShaderId(unique_identifier, program_type));
|
GetShaderId(unique_identifier, program_type));
|
||||||
if (is_compute) {
|
|
||||||
source += "#extension GL_ARB_compute_variable_group_size : require\n";
|
|
||||||
}
|
|
||||||
if (device.HasShaderBallot()) {
|
if (device.HasShaderBallot()) {
|
||||||
source += "#extension GL_ARB_shader_ballot : require\n";
|
source += "#extension GL_ARB_shader_ballot : require\n";
|
||||||
}
|
}
|
||||||
|
@ -295,6 +289,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
|
||||||
}
|
}
|
||||||
source += '\n';
|
source += '\n';
|
||||||
|
|
||||||
|
auto base_bindings = variant.base_bindings;
|
||||||
if (!is_compute) {
|
if (!is_compute) {
|
||||||
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
|
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
|
||||||
}
|
}
|
||||||
|
@ -318,13 +313,15 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
|
||||||
|
|
||||||
if (program_type == ProgramType::Geometry) {
|
if (program_type == ProgramType::Geometry) {
|
||||||
const auto [glsl_topology, debug_name, max_vertices] =
|
const auto [glsl_topology, debug_name, max_vertices] =
|
||||||
GetPrimitiveDescription(primitive_mode);
|
GetPrimitiveDescription(variant.primitive_mode);
|
||||||
|
|
||||||
source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
|
source += fmt::format("layout ({}) in;\n\n", glsl_topology);
|
||||||
source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
|
source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices);
|
||||||
}
|
}
|
||||||
if (program_type == ProgramType::Compute) {
|
if (program_type == ProgramType::Compute) {
|
||||||
source += "layout (local_size_variable) in;\n";
|
source +=
|
||||||
|
fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n",
|
||||||
|
variant.block_x, variant.block_y, variant.block_z);
|
||||||
}
|
}
|
||||||
|
|
||||||
source += '\n';
|
source += '\n';
|
||||||
|
@ -422,58 +419,53 @@ Shader CachedShader::CreateFromCache(const ShaderParameters& params,
|
||||||
unspecialized.code_b));
|
unspecialized.code_b));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
|
std::tuple<GLuint, BaseBindings> CachedShader::GetHandle(const ProgramVariant& variant) {
|
||||||
UpdateVariant();
|
EnsureValidLockerVariant();
|
||||||
|
|
||||||
const auto [entry, is_cache_miss] = curr_variant->programs.try_emplace(variant);
|
const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant);
|
||||||
auto& program = entry->second;
|
auto& program = entry->second;
|
||||||
if (is_cache_miss) {
|
if (is_cache_miss) {
|
||||||
program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b,
|
program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b,
|
||||||
variant, *curr_variant->locker);
|
*curr_locker_variant->locker, variant);
|
||||||
disk_cache.SaveUsage(GetUsage(variant, *curr_variant->locker));
|
disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker));
|
||||||
|
|
||||||
LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
|
LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto base_bindings = variant.base_bindings;
|
auto base_bindings = variant.base_bindings;
|
||||||
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
|
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
|
||||||
if (program_type != ProgramType::Compute) {
|
base_bindings.cbuf += STAGE_RESERVED_UBOS;
|
||||||
base_bindings.cbuf += STAGE_RESERVED_UBOS;
|
|
||||||
}
|
|
||||||
base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
|
base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
|
||||||
base_bindings.sampler += static_cast<u32>(entries.samplers.size());
|
base_bindings.sampler += static_cast<u32>(entries.samplers.size());
|
||||||
|
|
||||||
return {program->handle, base_bindings};
|
return {program->handle, base_bindings};
|
||||||
}
|
}
|
||||||
|
|
||||||
void CachedShader::UpdateVariant() {
|
bool CachedShader::EnsureValidLockerVariant() {
|
||||||
if (curr_variant && !curr_variant->locker->IsConsistent()) {
|
const auto previous_variant = curr_locker_variant;
|
||||||
curr_variant = nullptr;
|
if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) {
|
||||||
|
curr_locker_variant = nullptr;
|
||||||
}
|
}
|
||||||
if (!curr_variant) {
|
if (!curr_locker_variant) {
|
||||||
for (auto& variant : locker_variants) {
|
for (auto& variant : locker_variants) {
|
||||||
if (variant->locker->IsConsistent()) {
|
if (variant->locker->IsConsistent()) {
|
||||||
curr_variant = variant.get();
|
curr_locker_variant = variant.get();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!curr_variant) {
|
if (!curr_locker_variant) {
|
||||||
auto& new_variant = locker_variants.emplace_back();
|
auto& new_variant = locker_variants.emplace_back();
|
||||||
new_variant = std::make_unique<LockerVariant>();
|
new_variant = std::make_unique<LockerVariant>();
|
||||||
new_variant->locker = MakeLocker(system, program_type);
|
new_variant->locker = MakeLocker(system, program_type);
|
||||||
curr_variant = new_variant.get();
|
curr_locker_variant = new_variant.get();
|
||||||
}
|
}
|
||||||
|
return previous_variant == curr_locker_variant;
|
||||||
}
|
}
|
||||||
|
|
||||||
ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
|
ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
|
||||||
const ConstBufferLocker& locker) const {
|
const ConstBufferLocker& locker) const {
|
||||||
ShaderDiskCacheUsage usage;
|
return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(),
|
||||||
usage.unique_identifier = unique_identifier;
|
locker.GetBoundSamplers(), locker.GetBindlessSamplers()};
|
||||||
usage.variant = variant;
|
|
||||||
usage.keys = locker.GetKeys();
|
|
||||||
usage.bound_samplers = locker.GetBoundSamplers();
|
|
||||||
usage.bindless_samplers = locker.GetBindlessSamplers();
|
|
||||||
return usage;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
|
ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
|
||||||
|
@ -534,9 +526,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||||
if (!shader) {
|
if (!shader) {
|
||||||
auto locker{MakeLocker(system, unspecialized.program_type)};
|
auto locker{MakeLocker(system, unspecialized.program_type)};
|
||||||
FillLocker(*locker, usage);
|
FillLocker(*locker, usage);
|
||||||
|
|
||||||
shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type,
|
shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type,
|
||||||
unspecialized.code, unspecialized.code_b, usage.variant,
|
unspecialized.code, unspecialized.code_b, *locker,
|
||||||
*locker, true);
|
usage.variant, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::scoped_lock lock{mutex};
|
std::scoped_lock lock{mutex};
|
||||||
|
|
|
@ -86,7 +86,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets the GL program handle for the shader
|
/// Gets the GL program handle for the shader
|
||||||
std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);
|
std::tuple<GLuint, BaseBindings> GetHandle(const ProgramVariant& variant);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct LockerVariant {
|
struct LockerVariant {
|
||||||
|
@ -98,7 +98,7 @@ private:
|
||||||
GLShader::ShaderEntries entries, ProgramCode program_code,
|
GLShader::ShaderEntries entries, ProgramCode program_code,
|
||||||
ProgramCode program_code_b);
|
ProgramCode program_code_b);
|
||||||
|
|
||||||
void UpdateVariant();
|
bool EnsureValidLockerVariant();
|
||||||
|
|
||||||
ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
|
ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
|
||||||
const VideoCommon::Shader::ConstBufferLocker& locker) const;
|
const VideoCommon::Shader::ConstBufferLocker& locker) const;
|
||||||
|
@ -117,7 +117,7 @@ private:
|
||||||
ProgramCode program_code;
|
ProgramCode program_code;
|
||||||
ProgramCode program_code_b;
|
ProgramCode program_code_b;
|
||||||
|
|
||||||
LockerVariant* curr_variant = nullptr;
|
LockerVariant* curr_locker_variant = nullptr;
|
||||||
std::vector<std::unique_ptr<LockerVariant>> locker_variants;
|
std::vector<std::unique_ptr<LockerVariant>> locker_variants;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -52,11 +52,11 @@ struct BindlessSamplerKey {
|
||||||
Tegra::Engines::SamplerDescriptor sampler{};
|
Tegra::Engines::SamplerDescriptor sampler{};
|
||||||
};
|
};
|
||||||
|
|
||||||
constexpr u32 NativeVersion = 6;
|
constexpr u32 NativeVersion = 7;
|
||||||
|
|
||||||
// Making sure sizes doesn't change by accident
|
// Making sure sizes doesn't change by accident
|
||||||
static_assert(sizeof(BaseBindings) == 16);
|
static_assert(sizeof(BaseBindings) == 16);
|
||||||
static_assert(sizeof(ProgramVariant) == 20);
|
static_assert(sizeof(ProgramVariant) == 28);
|
||||||
|
|
||||||
ShaderCacheVersionHash GetShaderCacheVersionHash() {
|
ShaderCacheVersionHash GetShaderCacheVersionHash() {
|
||||||
ShaderCacheVersionHash hash{};
|
ShaderCacheVersionHash hash{};
|
||||||
|
|
|
@ -44,32 +44,49 @@ struct BaseBindings {
|
||||||
u32 sampler{};
|
u32 sampler{};
|
||||||
u32 image{};
|
u32 image{};
|
||||||
|
|
||||||
bool operator==(const BaseBindings& rhs) const {
|
bool operator==(const BaseBindings& rhs) const noexcept {
|
||||||
return std::tie(cbuf, gmem, sampler, image) ==
|
return std::tie(cbuf, gmem, sampler, image) ==
|
||||||
std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image);
|
std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator!=(const BaseBindings& rhs) const {
|
bool operator!=(const BaseBindings& rhs) const noexcept {
|
||||||
return !operator==(rhs);
|
return !operator==(rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
static_assert(std::is_trivially_copyable_v<BaseBindings>);
|
static_assert(std::is_trivially_copyable_v<BaseBindings>);
|
||||||
|
|
||||||
/// Describes the different variants a single program can be compiled.
|
/// Describes the different variants a program can be compiled with.
|
||||||
struct ProgramVariant {
|
struct ProgramVariant final {
|
||||||
BaseBindings base_bindings;
|
ProgramVariant() = default;
|
||||||
GLenum primitive_mode{};
|
|
||||||
|
|
||||||
bool operator==(const ProgramVariant& rhs) const {
|
/// Graphics constructor.
|
||||||
return std::tie(base_bindings, primitive_mode) ==
|
explicit constexpr ProgramVariant(BaseBindings base_bindings, GLenum primitive_mode) noexcept
|
||||||
std::tie(rhs.base_bindings, rhs.primitive_mode);
|
: base_bindings{base_bindings}, primitive_mode{primitive_mode} {}
|
||||||
|
|
||||||
|
/// Compute constructor.
|
||||||
|
explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z) noexcept
|
||||||
|
: block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)} {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator!=(const ProgramVariant& rhs) const {
|
// Graphics specific parameters.
|
||||||
|
BaseBindings base_bindings{};
|
||||||
|
GLenum primitive_mode{};
|
||||||
|
|
||||||
|
// Compute specific parameters.
|
||||||
|
u32 block_x{};
|
||||||
|
u16 block_y{};
|
||||||
|
u16 block_z{};
|
||||||
|
|
||||||
|
bool operator==(const ProgramVariant& rhs) const noexcept {
|
||||||
|
return std::tie(base_bindings, primitive_mode, block_x, block_y, block_z) ==
|
||||||
|
std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.block_x, rhs.block_y,
|
||||||
|
rhs.block_z);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator!=(const ProgramVariant& rhs) const noexcept {
|
||||||
return !operator==(rhs);
|
return !operator==(rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(std::is_trivially_copyable_v<ProgramVariant>);
|
static_assert(std::is_trivially_copyable_v<ProgramVariant>);
|
||||||
|
|
||||||
/// Describes how a shader is used.
|
/// Describes how a shader is used.
|
||||||
|
@ -108,8 +125,11 @@ struct hash<OpenGL::BaseBindings> {
|
||||||
template <>
|
template <>
|
||||||
struct hash<OpenGL::ProgramVariant> {
|
struct hash<OpenGL::ProgramVariant> {
|
||||||
std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
|
std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
|
||||||
return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^
|
return std::hash<OpenGL::BaseBindings>{}(variant.base_bindings) ^
|
||||||
(static_cast<std::size_t>(variant.primitive_mode) << 6);
|
(static_cast<std::size_t>(variant.primitive_mode) << 6) ^
|
||||||
|
static_cast<std::size_t>(variant.block_x) ^
|
||||||
|
(static_cast<std::size_t>(variant.block_y) << 32) ^
|
||||||
|
(static_cast<std::size_t>(variant.block_z) << 48);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -117,7 +137,7 @@ template <>
|
||||||
struct hash<OpenGL::ShaderDiskCacheUsage> {
|
struct hash<OpenGL::ShaderDiskCacheUsage> {
|
||||||
std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
|
std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
|
||||||
return static_cast<std::size_t>(usage.unique_identifier) ^
|
return static_cast<std::size_t>(usage.unique_identifier) ^
|
||||||
std::hash<OpenGL::ProgramVariant>()(usage.variant);
|
std::hash<OpenGL::ProgramVariant>{}(usage.variant);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue