texture_cache: Add async texture decoding

2023-02-22 00:26:07 -05:00 · 2023-02-22 00:26:07 -05:00 · 090bc588e5
commit 090bc588e5
parent 8f3e2a1b48
4 changed files with 89 additions and 0 deletions
--- a/src/common/scratch_buffer.h
+++ b/src/common/scratch_buffer.h
@ -23,6 +23,7 @@ public:
          buffer{Common::make_unique_for_overwrite<T[]>(initial_capacity)} {}

    ~ScratchBuffer() = default;
+    ScratchBuffer(ScratchBuffer&&) = default;

    /// This will only grow the buffer's capacity if size is greater than the current capacity.
    /// The previously held data will remain intact.
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@ -38,6 +38,9 @@ enum class ImageFlagBits : u32 {
    Rescaled = 1 << 13,
    CheckingRescalable = 1 << 14,
    IsRescalable = 1 << 15,
+
+    AsynchronousDecode = 1 << 16, ///< RefreshContents queues an asynchronous decode instead.
+    IsDecoding = 1 << 17, ///< Is currently being decoded asynchronously.
 };
 DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits)

--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@ -85,6 +85,11 @@ void TextureCache<P>::RunGarbageCollector() {
        }
        --num_iterations;
        auto& image = slot_images[image_id];
+        if (True(image.flags & ImageFlagBits::IsDecoding)) {
+            // This image is still being decoded, deleting it will invalidate the slot
+            // used by the async decoder thread.
+            return false;
+        }
        const bool must_download =
            image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
        if (!high_priority_mode &&
@ -133,6 +138,8 @@ void TextureCache<P>::TickFrame() {
    sentenced_images.Tick();
    sentenced_framebuffers.Tick();
    sentenced_image_view.Tick();
+    TickAsyncDecode();
+
    runtime.TickFrame();
    critical_gc = 0;
    ++frame_tick;
@ -777,6 +784,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
        LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
        return;
    }
+    if (True(image.flags & ImageFlagBits::AsynchronousDecode)) {
+        QueueAsyncDecode(image, image_id);
+        return;
+    }
    auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
    UploadImageContents(image, staging);
    runtime.InsertUploadMemoryBarrier();
@ -989,6 +1000,64 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
    return fitted_size;
 }

+/// Starts an asynchronous decode of a converted image.
+///
+/// The guest data is read and unswizzled on the calling thread; only the ConvertImage step is
+/// queued on texture_decode_worker. The result is stored in an AsyncDecodeContext appended to
+/// async_decodes, and the image stays flagged IsDecoding until TickAsyncDecode clears it.
+///
+/// @param image    Image to decode; its flags gain ImageFlagBits::IsDecoding.
+/// @param image_id Slot id of the image, recorded in the decode context.
+template <class P>
+void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
+    UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted));
+
+    image.flags |= ImageFlagBits::IsDecoding;
+    auto decode = std::make_unique<AsyncDecodeContext>();
+    // async_decodes owns the context; the queued task only borrows it through this raw pointer.
+    auto* decode_ptr = decode.get();
+    decode->image_id = image_id;
+    async_decodes.push_back(std::move(decode));
+
+    // The unswizzled data lives in its own buffer because it is moved into the task below.
+    Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes);
+    const size_t guest_size_bytes = image.guest_size_bytes;
+    swizzle_data_buffer.resize_destructive(guest_size_bytes);
+    gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+    auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer,
+                                 local_unswizzle_data_buffer);
+    const size_t out_size = MapSizeBytes(image);
+
+    // `copies` is not used again in this scope, so it is moved into the closure rather than
+    // copied.
+    auto func = [out_size, copies = std::move(copies), info = image.info,
+                 input = std::move(local_unswizzle_data_buffer),
+                 async_decode = decode_ptr]() mutable {
+        async_decode->decoded_data.resize_destructive(out_size);
+        std::span copies_span{copies.data(), copies.size()};
+        ConvertImage(input, info, async_decode->decoded_data, copies_span);
+
+        // TODO: Do we need this lock?
+        std::unique_lock lock{async_decode->mutex};
+        // Publish the copies first; `complete` is what TickAsyncDecode checks.
+        async_decode->copies = std::move(copies);
+        async_decode->complete = true;
+    };
+    texture_decode_worker.QueueWork(std::move(func));
+}
+
+/// Uploads every finished asynchronous decode to its image.
+///
+/// Walks async_decodes; contexts whose task has not set `complete` yet are left queued. For a
+/// completed context the decoded data is copied into a staging buffer and uploaded to the
+/// image, the IsDecoding flag is cleared and the context is removed. One upload memory barrier
+/// is inserted afterwards if anything was uploaded.
+template <class P>
+void TextureCache<P>::TickAsyncDecode() {
+    bool has_uploads{};
+    auto i = async_decodes.begin();
+    while (i != async_decodes.end()) {
+        auto* async_decode = i->get();
+        std::unique_lock lock{async_decode->mutex};
+        if (!async_decode->complete) {
+            ++i;
+            continue;
+        }
+        Image& image = slot_images[async_decode->image_id];
+        auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
+        std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(),
+                    async_decode->decoded_data.size());
+        image.UploadMemory(staging, async_decode->copies);
+        image.flags &= ~ImageFlagBits::IsDecoding;
+        has_uploads = true;
+        // erase() destroys the context together with its mutex. A mutex must not be destroyed
+        // while it is owned, and the lock guard must not unlock freed memory afterwards, so
+        // ownership is released first.
+        lock.unlock();
+        i = async_decodes.erase(i);
+    }
+    if (has_uploads) {
+        runtime.InsertUploadMemoryBarrier();
+    }
+}
+
 template <class P>
 bool TextureCache<P>::ScaleUp(Image& image) {
    const bool has_copy = image.HasScaled();
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@ -3,6 +3,7 @@

 #pragma once

+#include <atomic>
 #include <deque>
 #include <limits>
 #include <mutex>
@ -18,6 +19,7 @@
 #include "common/lru_cache.h"
 #include "common/polyfill_ranges.h"
 #include "common/scratch_buffer.h"
+#include "common/thread_worker.h"
 #include "video_core/compatible_formats.h"
 #include "video_core/control/channel_state_cache.h"
 #include "video_core/delayed_destruction_ring.h"
@ -54,6 +56,14 @@ struct ImageViewInOut {
    ImageViewId id{};
 };

+/// State shared between the texture cache and one queued asynchronous decode task.
+struct AsyncDecodeContext {
+    /// Slot id of the image the decoded data is uploaded to.
+    ImageId image_id;
+    /// Output of the decode, filled by the queued task.
+    Common::ScratchBuffer<u8> decoded_data;
+    /// Copy regions handed to the image upload together with decoded_data.
+    std::vector<BufferImageCopy> copies;
+    /// Taken by the task while it publishes its result and by the consumer while it checks it.
+    std::mutex mutex;
+    /// Set by the task once the result is ready. Explicitly initialized so the flag never
+    /// starts out indeterminate, regardless of how the context is constructed.
+    std::atomic_bool complete{false};
+};
+
 using TextureCacheGPUMap = std::unordered_map<u64, std::vector<ImageId>, Common::IdentityHash<u64>>;

 class TextureCacheChannelInfo : public ChannelInfo {
@ -377,6 +387,9 @@ private:
    bool ScaleDown(Image& image);
    u64 GetScaledImageSizeBytes(const ImageBase& image);

+    /// Queues the decode of the given image on the texture decode worker.
+    void QueueAsyncDecode(Image& image, ImageId image_id);
+    /// Uploads the results of asynchronous decodes that have completed.
+    void TickAsyncDecode();
+
    Runtime& runtime;

    VideoCore::RasterizerInterface& rasterizer;
@ -430,6 +443,9 @@ private:

    u64 modification_tick = 0;
    u64 frame_tick = 0;
+
+    /// Decode contexts; queued worker tasks reference them through raw pointers.
+    /// Declared before the worker: members are destroyed in reverse declaration order, so the
+    /// worker is torn down before the contexts its tasks write to are freed.
+    /// NOTE(review): assumes ThreadWorker's destructor waits for a running task - confirm.
+    std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
+    Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
 };

 } // namespace VideoCommon