From 5665d055476fa793192523c3cb6fe06369d58674 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 4 Jul 2021 22:48:41 -0400 Subject: [PATCH] astc_decoder: Optimize the use EncodingData This buffer was a list of EncodingData structures sorted by their bit length, with some duplication from the cpu decoder implementation. We can take advantage of its sorted property to optimize its usage in the shader. Thanks to wwylele for the optimization idea. --- src/video_core/host_shaders/astc_decoder.comp | 50 ++++++------- .../renderer_opengl/util_shaders.cpp | 13 ++-- src/video_core/renderer_opengl/util_shaders.h | 1 - .../renderer_vulkan/vk_compute_pass.cpp | 42 +++-------- src/video_core/textures/astc.cpp | 70 +++++++++++++++++++ src/video_core/textures/astc.h | 70 ------------------- 6 files changed, 108 insertions(+), 138 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index c37f15bfd8..7f4efa31af 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -10,18 +10,16 @@ #define END_PUSH_CONSTANTS }; #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 -#define BINDING_ENC_BUFFER 1 -#define BINDING_SWIZZLE_BUFFER 2 -#define BINDING_OUTPUT_IMAGE 3 +#define BINDING_SWIZZLE_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv #define BEGIN_PUSH_CONSTANTS #define END_PUSH_CONSTANTS #define UNIFORM(n) layout(location = n) uniform -#define BINDING_SWIZZLE_BUFFER 0 -#define BINDING_INPUT_BUFFER 1 -#define BINDING_ENC_BUFFER 2 +#define BINDING_INPUT_BUFFER 0 +#define BINDING_SWIZZLE_BUFFER 1 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -64,11 +62,6 @@ layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint astc_data[]; }; -// ASTC Encodings data -layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { - EncodingData encoding_values[]; -}; - layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; const uint GOB_SIZE_X = 64; @@ -94,6 +87,19 @@ const int JUST_BITS = 0; const int QUINT = 1; const int TRIT = 2; +// ASTC Encodings data, sorted in ascending order based on their BitLength value +// (see GetBitLength() function) +EncodingData encoding_values[22] = EncodingData[]( + EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), + EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), + EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), + EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), + EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), + EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), + EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0), + EncodingData(JUST_BITS, 8, 0, 0) +); + // The following constants are expanded variants of the Replicate() // function calls corresponding to the following arguments: // value: index into the generated table @@ -596,22 +602,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; } - int range = 256; - while (--range > 0) { - EncodingData val = encoding_values[range]; + // Find the largest encoding that's within color_data_bits + // TODO(ameerj): profile with binary search + int range = 0; + while (++range < encoding_values.length()) { uint bit_length = GetBitLength(num_values, range); - if (bit_length <= color_data_bits) { - while (--range > 0) { - EncodingData newval = encoding_values[range]; - if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { - break; - } - } - ++range; + if (bit_length > color_data_bits) { break; } } - DecodeIntegerSequence(range, num_values); + DecodeIntegerSequence(range - 1, num_values); uint out_index = 0; for (int itr = 0; itr < result_index; ++itr) { if (out_index >= num_values) { @@ -1110,10 +1110,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { } weight_index -= 2; if ((mode_layout != 9) && ((mode & 0x200) != 0)) { - const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31); + const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); params.max_weight = max_weights[weight_index]; } else { - const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7); + const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6); params.max_weight = max_weights[weight_index]; } return params; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 37a4d1d9db..a2b2647005 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -60,19 +60,15 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); swizzle_table_buffer.Create(); - astc_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES, - 0); } UtilShaders::~UtilShaders() = default; void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, std::span swizzles) { - static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; - static constexpr GLuint BINDING_INPUT_BUFFER = 1; - static constexpr GLuint BINDING_ENC_BUFFER = 2; + static constexpr GLuint BINDING_INPUT_BUFFER = 0; + static constexpr GLuint BINDING_SWIZZLE_BUFFER = 1; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; const Extent2D tile_size{ @@ -81,7 +77,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, }; program_manager.BindComputeProgram(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(1, tile_size.width, tile_size.height); @@ -103,11 +98,11 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glUniform1ui(6, params.block_height); glUniform1ui(7, params.block_height_mask); - glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, - GL_WRITE_ONLY, GL_RGBA8); // ASTC texture data glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, GL_RGBA8); glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 53d65f368e..ef881e35f6 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -62,7 +62,6 @@ private: ProgramManager& program_manager; OGLBuffer swizzle_table_buffer; - OGLBuffer astc_buffer; OGLProgram astc_decoder_program; OGLProgram block_linear_unswizzle_2d_program; diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 561cf5e11f..328813a572 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -30,16 +30,13 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; -using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; -using namespace Tegra::Texture::ASTC; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; -constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; -constexpr size_t ASTC_NUM_BINDINGS = 4; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; +constexpr size_t ASTC_NUM_BINDINGS = 3; template inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ @@ -75,7 +72,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; -constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ +constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, @@ -83,13 +80,6 @@ constexpr std::array ASTC_DESCRIPTOR_SET_BINDIN .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, - { - .binding = ASTC_BINDING_ENC_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, { .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, @@ -108,12 +98,12 @@ constexpr std::array ASTC_DESCRIPTOR_SET_BINDIN constexpr DescriptorBankInfo ASTC_BANK_INFO{ .uniform_buffers = 0, - .storage_buffers = 3, + .storage_buffers = 2, .texture_buffers = 0, .image_buffers = 0, .textures = 0, .images = 1, - .score = 4, + .score = 3, }; constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ @@ -135,14 +125,6 @@ constexpr std::array .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, - { - .dstBinding = ASTC_BINDING_ENC_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, { .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, .dstArrayElement = 0, @@ -355,7 +337,7 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -369,11 +351,7 @@ void ASTCDecoderPass::MakeDataBuffer() { data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, - sizeof(ASTC_ENCODINGS_VALUES)); - // Tack on the swizzle table at the end of the buffer - std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, - sizeof(SWIZZLE_TABLE)); + std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { @@ -443,9 +421,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); - update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), - sizeof(SWIZZLE_TABLE)); + update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); const void* const descriptor_data{update_descriptor_queue.UpdateData()}; diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 26c19d75ba..25161df1f6 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -151,6 +151,76 @@ private: const IntType& m_Bits; }; +enum class IntegerEncoding { JustBits, Quint, Trit }; + +struct IntegerEncodedValue { + constexpr IntegerEncodedValue() = default; + + constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) + : encoding{encoding_}, num_bits{num_bits_} {} + + constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { + return encoding == other.encoding && num_bits == other.num_bits; + } + + // Returns the number of bits required to encode num_vals values. + u32 GetBitLength(u32 num_vals) const { + u32 total_bits = num_bits * num_vals; + if (encoding == IntegerEncoding::Trit) { + total_bits += (num_vals * 8 + 4) / 5; + } else if (encoding == IntegerEncoding::Quint) { + total_bits += (num_vals * 7 + 2) / 3; + } + return total_bits; + } + + IntegerEncoding encoding{}; + u32 num_bits = 0; + u32 bit_value = 0; + union { + u32 quint_value = 0; + u32 trit_value; + }; +}; + +// Returns a new instance of this struct that corresponds to the +// can take no more than mav_value values +static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { + while (mav_value > 0) { + u32 check = mav_value + 1; + + // Is mav_value a power of two? + if (!(check & (check - 1))) { + return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); + } + + // Is mav_value of the type 3*2^n - 1? + if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); + } + + // Is mav_value of the type 5*2^n - 1? + if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); + } + + // Apparently it can't be represented with a bounded integer sequence... + // just iterate. + mav_value--; + } + return IntegerEncodedValue(IntegerEncoding::JustBits, 0); +} + +static constexpr std::array MakeEncodedValues() { + std::array encodings{}; + for (std::size_t i = 0; i < encodings.size(); ++i) { + encodings[i] = CreateEncoding(static_cast(i)); + } + return encodings; +} + +static constexpr std::array ASTC_ENCODINGS_VALUES = MakeEncodedValues(); + namespace Tegra::Texture::ASTC { using IntegerEncodedVector = boost::container::static_vector< IntegerEncodedValue, 256, diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 9e148afc4a..14d2beec0c 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -9,76 +9,6 @@ namespace Tegra::Texture::ASTC { -enum class IntegerEncoding { JustBits, Quint, Trit }; - -struct IntegerEncodedValue { - constexpr IntegerEncodedValue() = default; - - constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) - : encoding{encoding_}, num_bits{num_bits_} {} - - constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { - return encoding == other.encoding && num_bits == other.num_bits; - } - - // Returns the number of bits required to encode num_vals values. - u32 GetBitLength(u32 num_vals) const { - u32 total_bits = num_bits * num_vals; - if (encoding == IntegerEncoding::Trit) { - total_bits += (num_vals * 8 + 4) / 5; - } else if (encoding == IntegerEncoding::Quint) { - total_bits += (num_vals * 7 + 2) / 3; - } - return total_bits; - } - - IntegerEncoding encoding{}; - u32 num_bits = 0; - u32 bit_value = 0; - union { - u32 quint_value = 0; - u32 trit_value; - }; -}; - -// Returns a new instance of this struct that corresponds to the -// can take no more than mav_value values -constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { - while (mav_value > 0) { - u32 check = mav_value + 1; - - // Is mav_value a power of two? - if (!(check & (check - 1))) { - return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); - } - - // Is mav_value of the type 3*2^n - 1? - if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); - } - - // Is mav_value of the type 5*2^n - 1? - if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { - return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); - } - - // Apparently it can't be represented with a bounded integer sequence... - // just iterate. - mav_value--; - } - return IntegerEncodedValue(IntegerEncoding::JustBits, 0); -} - -constexpr std::array MakeEncodedValues() { - std::array encodings{}; - for (std::size_t i = 0; i < encodings.size(); ++i) { - encodings[i] = CreateEncoding(static_cast(i)); - } - return encodings; -} - -constexpr std::array ASTC_ENCODINGS_VALUES = MakeEncodedValues(); - void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height, std::span output);