video_core: NVDEC Implementation

This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library. The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data. To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library. Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header. Async GPU is not properly implemented at the moment. Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com>
2020-10-26 23:07:36 -04:00 · 2020-10-26 23:07:36 -04:00 · eb67a45ca8
commit eb67a45ca8
parent 2f6ba54483
53 changed files with 4033 additions and 310 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -263,6 +263,7 @@ if (CONAN_REQUIRED_LIBS)
        libzip:with_openssl=False
        libzip:enable_windows_crypto=False
    )
    conan_check(VERSION 1.24.0 REQUIRED)
    # Add the bincrafters remote
    conan_add_remote(NAME bincrafters
@ -354,6 +355,19 @@ if (NOT LIBUSB_FOUND)
    set(LIBUSB_LIBRARIES usb)
 endif()
 # FFmpeg: MSVC builds use the bundled externals package obtained through
 # download_bundled_external(); every other toolchain relies on a system
 # installation located with find_package().
 if (MSVC)
    set(FFMPEG_EXT_NAME "ffmpeg-4.2.1")
    set(FFMPEG_PATH "${CMAKE_BINARY_DIR}/externals/${FFMPEG_EXT_NAME}")
    download_bundled_external("ffmpeg/" ${FFMPEG_EXT_NAME} "")
    set(FFMPEG_FOUND YES)
    # Headers live under include/; import libraries and DLLs both live under bin/.
    set(FFMPEG_INCLUDE_DIR "${FFMPEG_PATH}/include" CACHE PATH "Path to FFmpeg headers" FORCE)
    set(FFMPEG_LIBRARY_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg library" FORCE)
    set(FFMPEG_DLL_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg dll's" FORCE)
 else()
    find_package(FFmpeg REQUIRED)
 endif()
 # Prefer the -pthread flag on Linux.
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
--- a/CMakeModules/CopyYuzuFFmpegDeps.cmake
+++ b/CMakeModules/CopyYuzuFFmpegDeps.cmake
@ -0,0 +1,10 @@
 # copy_yuzu_FFmpeg_deps(<target_dir>)
 #
 # Hands the FFmpeg runtime DLLs found in FFMPEG_DLL_DIR to windows_copy_files()
 # with the per-configuration bin directory of the build tree as destination.
 # <target_dir> is forwarded unchanged as the first argument of
 # windows_copy_files().
 function(copy_yuzu_FFmpeg_deps target_dir)
    include(WindowsCopyFiles)
    set(DLL_DEST "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/")
    set(ffmpeg_runtime_dlls
        avcodec-58.dll
        avutil-56.dll
        swresample-3.dll
        swscale-5.dll
    )
    windows_copy_files(${target_dir} ${FFMPEG_DLL_DIR} ${DLL_DEST} ${ffmpeg_runtime_dlls})
 endfunction()
--- a/externals/find-modules/FindFFmpeg.cmake
+++ b/externals/find-modules/FindFFmpeg.cmake
@ -0,0 +1,100 @@
 # - Try to find ffmpeg libraries (libavcodec, libavutil and libswscale)
 # Once done this will define
 #
 # FFMPEG_FOUND - system has ffmpeg or libav
 # FFMPEG_INCLUDE_DIR - the ffmpeg include directory
 # FFMPEG_LIBRARIES - Link these to use ffmpeg
 # FFMPEG_LIBAVCODEC
 # FFMPEG_LIBAVUTIL
 # FFMPEG_LIBSWSCALE
 #
 # Copyright (c) 2008 Andreas Schneider <mail@cynapses.org>
 # Modified for other libraries by Lasse Kärkkäinen <tronic>
 # Modified for Hedgewars by Stepik777
 # Modified for FFmpeg-example Tuukka Pasanen 2018
 # Modified for yuzu toastUnlimted 2020
 #
 # Redistribution and use is allowed according to the terms of the New
 # BSD license.
 #
 include(FindPackageHandleStandardArgs)

 # Locate libavcodec, libavutil and libswscale plus the libavcodec headers.
 #
 # Results: FFMPEG_FOUND, FFMPEG_INCLUDE_DIR, FFMPEG_LIBRARIES and the
 # per-library cache entries FFMPEG_LIBAVCODEC / FFMPEG_LIBAVUTIL /
 # FFMPEG_LIBSWSCALE.
 #
 # The search is skipped when the caller already supplied both
 # FFMPEG_LIBRARIES and FFMPEG_INCLUDE_DIR.
 if(NOT (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR))
   # pkg-config, when present, contributes extra search directories for the
   # find_path()/find_library() calls below.
   find_package(PkgConfig)
   if(PKG_CONFIG_FOUND)
     pkg_check_modules(_FFMPEG_AVCODEC libavcodec)
     pkg_check_modules(_FFMPEG_AVUTIL libavutil)
     pkg_check_modules(_FFMPEG_SWSCALE libswscale)
   endif()

   find_path(FFMPEG_AVCODEC_INCLUDE_DIR
     NAMES libavcodec/avcodec.h
     PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS}
       /usr/include
       /usr/local/include
       /opt/local/include
       /sw/include
     PATH_SUFFIXES ffmpeg libav)

   # The three libraries are searched identically; only the name differs.
   foreach(_ffmpeg_component AVCODEC AVUTIL SWSCALE)
     string(TOLOWER "${_ffmpeg_component}" _ffmpeg_library_name)
     find_library(FFMPEG_LIB${_ffmpeg_component}
       NAMES ${_ffmpeg_library_name}
       PATHS ${_FFMPEG_${_ffmpeg_component}_LIBRARY_DIRS}
         /usr/lib
         /usr/local/lib
         /opt/local/lib
         /sw/lib)
   endforeach()
   # Find modules run in the caller's scope; do not leak the loop helpers.
   unset(_ffmpeg_component)
   unset(_ffmpeg_library_name)

   if(FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVUTIL AND FFMPEG_LIBSWSCALE)
     set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
     set(FFMPEG_LIBRARIES
       ${FFMPEG_LIBAVCODEC}
       ${FFMPEG_LIBAVUTIL}
       ${FFMPEG_LIBSWSCALE})
   endif()
 endif()

 # Must run AFTER the search so it sees the real results. The first argument
 # has to match the name given to find_package() ("FFmpeg"): that is what makes
 # REQUIRED and QUIET take effect, because CMake communicates them through
 # FFmpeg_FIND_REQUIRED / FFmpeg_FIND_QUIETLY. FFMPEG_FOUND stays the result
 # variable callers test.
 find_package_handle_standard_args(FFmpeg
   FOUND_VAR FFMPEG_FOUND
   REQUIRED_VARS
     FFMPEG_LIBRARIES
     FFMPEG_INCLUDE_DIR
   VERSION_VAR FFMPEG_VERSION
   FAIL_MESSAGE "Could not find libavcodec or libavutil or libswscale (or the libavcodec headers)"
 )
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -150,6 +150,8 @@ add_library(common STATIC
    scope_exit.h
    spin_lock.cpp
    spin_lock.h
    stream.cpp
    stream.h
    string_util.cpp
    string_util.h
    swap.h
--- a/src/common/stream.cpp
+++ b/src/common/stream.cpp
@ -0,0 +1,47 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <stdexcept>
 #include "common/common_types.h"
 #include "common/stream.h"
 namespace Common {
 // Both special members are compiler-generated; no custom setup or teardown is performed here.
 Stream::Stream() = default;
 Stream::~Stream() = default;
 // Moves the cursor to a new position.
 //   SetOrigin      - offset is the new position, clamped to [0, buffer.size()].
 //   FromCurrentPos - offset is added to the current position, then clamped as above.
 //   FromEnd        - offset is subtracted from buffer.size(), then clamped as above.
 void Stream::Seek(s32 offset, SeekOrigin origin) {
    if (origin == SeekOrigin::SetOrigin) {
        if (offset < 0) {
            position = 0;
        } else if (static_cast<std::size_t>(offset) >= buffer.size()) {
            // Clamp on the REQUESTED offset (not the previous position) so the cursor can
            // never end up past the end of the buffer.
            position = buffer.size();
        } else {
            position = static_cast<std::size_t>(offset);
        }
    } else if (origin == SeekOrigin::FromCurrentPos) {
        Seek(static_cast<s32>(position) + offset, SeekOrigin::SetOrigin);
    } else if (origin == SeekOrigin::FromEnd) {
        Seek(static_cast<s32>(buffer.size()) - offset, SeekOrigin::SetOrigin);
    }
 }
 // Returns the byte under the cursor and advances the cursor by one.
 // Throws std::out_of_range when the cursor is at or beyond the end of the buffer.
 u8 Stream::ReadByte() {
    if (position >= buffer.size()) {
        throw std::out_of_range("Attempting to read a byte not within the buffer range");
    }
    const u8 value = buffer[position];
    ++position;
    return value;
 }
 // Writes one byte at the cursor.
 // With the cursor at the end of the buffer the byte is appended and the cursor advances.
 // Otherwise the byte is inserted in front of the byte currently under the cursor and the
 // cursor is left where it was.
 void Stream::WriteByte(u8 byte) {
    if (position != buffer.size()) {
        buffer.insert(buffer.begin() + position, byte);
        return;
    }
    buffer.push_back(byte);
    ++position;
 }
 } // namespace Common
--- a/src/common/stream.h
+++ b/src/common/stream.h
@ -0,0 +1,50 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <vector>
 #include "common/common_types.h"
 namespace Common {
 /// Reference point used by Stream::Seek to interpret its offset argument.
 enum class SeekOrigin {
    /// The offset is used directly as the new cursor position.
    SetOrigin,
    /// The offset is added to the current cursor position.
    FromCurrentPos,
    /// The offset is subtracted from the size of the buffer.
    FromEnd,
 };
 /// Growable byte buffer paired with a cursor ("position") used for sequential reads and writes.
 class Stream {
 public:
    /// Stream creates a bitstream and provides common functionality on the stream.
    explicit Stream();
    ~Stream();
    /// Reposition bitstream "cursor" to the specified offset from origin
    /// (see SeekOrigin for how the offset is interpreted).
    void Seek(s32 offset, SeekOrigin origin);
    /// Reads next byte in the stream buffer and increments position.
    /// Throws std::out_of_range when the cursor is not inside the buffer.
    u8 ReadByte();
    /// Writes byte at current position: appended (advancing the cursor) when the cursor is at
    /// the end of the buffer, otherwise inserted at the cursor without moving it.
    void WriteByte(u8 byte);
    /// Current cursor position, as an index into the buffer.
    std::size_t GetPosition() const {
        return position;
    }
    /// Mutable access to the underlying byte buffer.
    std::vector<u8>& GetBuffer() {
        return buffer;
    }
    /// Read-only access to the underlying byte buffer.
    const std::vector<u8>& GetBuffer() const {
        return buffer;
    }
 private:
    // Backing storage for the stream contents.
    std::vector<u8> buffer;
    // Cursor: index of the next byte to read or write.
    std::size_t position{0};
 };
 } // namespace Common
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -439,6 +439,8 @@ add_library(core STATIC
    hle/service/nvdrv/devices/nvhost_gpu.h
    hle/service/nvdrv/devices/nvhost_nvdec.cpp
    hle/service/nvdrv/devices/nvhost_nvdec.h
    hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
    hle/service/nvdrv/devices/nvhost_nvdec_common.h
    hle/service/nvdrv/devices/nvhost_nvjpg.cpp
    hle/service/nvdrv/devices/nvhost_nvjpg.h
    hle/service/nvdrv/devices/nvhost_vic.cpp
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <cstring>
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_base.h"
 namespace Service::Nvidia::Devices {
-nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {}
+nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 nvhost_nvdec::~nvhost_nvdec() = default;
 u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
    case IoctlCommand::IocSubmit:
        return Submit(input, output);
    case IoctlCommand::IocGetSyncpoint:
@ -29,79 +31,29 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
    case IoctlCommand::IocGetWaitbase:
        return GetWaitbase(input, output);
    case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
    case IoctlCommand::IocMapBuffer3:
    case IoctlCommand::IocMapBufferEx:
-        return MapBufferEx(input, output);
+        return MapBuffer(input, output);
-    case IoctlCommand::IocUnmapBufferEx:
+    case IoctlCommand::IocUnmapBufferEx: {
-        return UnmapBufferEx(input, output);
+        // This command is sent when the video stream has ended, flush all video contexts
        // This is usually sent in the following order: vic, nvdec, vic.
        // Inform the GPU to clear any remaining nvdec buffers when this is detected.
        LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
        Tegra::ChCommandHeaderList cmdlist(1);
        cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
        system.GPU().PushCommandBuffer(cmdlist);
        [[fallthrough]]; // fallthrough to unmap buffers
    };
    case IoctlCommand::IocUnmapBuffer:
    case IoctlCommand::IocUnmapBuffer2:
    case IoctlCommand::IocUnmapBuffer3:
        return UnmapBuffer(input, output);
    case IoctlCommand::IocSetSubmitTimeout:
        return SetSubmitTimeout(input, output);
    }
-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
    return 0;
 }
 // Remembers the nvmap file descriptor carried in the ioctl input. The output buffer is unused.
 u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSetNvmapFD request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_DEBUG(Service_NVDRV, "called, fd={}", request.nvmap_fd);

    nvmap_fd = request.nvmap_fd;
    return 0;
 }
 // Stub: logs the call and echoes the IoctlSubmit-sized request back to the caller unchanged.
 u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSubmit request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called");

    std::memcpy(output.data(), &request, sizeof(request));
    return 0;
 }
 // Replies to a syncpoint query: the request is echoed back with its value field forced to 0.
 u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetSyncpoint reply{};
    std::memcpy(&reply, input.data(), sizeof(reply));
    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", reply.unknown);

    // Seems to be hard coded at 0
    reply.value = 0;
    std::memcpy(output.data(), &reply, sizeof(reply));
    return 0;
 }
 // Replies to a waitbase query: the request is echoed back with its value field forced to 0.
 u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetWaitbase reply{};
    std::memcpy(&reply, input.data(), sizeof(reply));
    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", reply.unknown);

    // Seems to be hard coded at 0
    reply.value = 0;
    std::memcpy(output.data(), &reply, sizeof(reply));
    return 0;
 }
 // Stub: logs the requested address, then returns the request with both address words zeroed.
 u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBuffer request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", request.address_2,
                request.address_1);

    request.address_1 = 0;
    request.address_2 = 0;
    std::memcpy(output.data(), &request, sizeof(request));
    return 0;
 }
 // Stub: logs the requested address, then returns the request with both address words zeroed.
 u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBufferEx request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", request.address_2,
                request.address_1);

    request.address_1 = 0;
    request.address_2 = 0;
    std::memcpy(output.data(), &request, sizeof(request));
    return 0;
 }
 // Stub: logs the call and echoes the IoctlUnmapBufferEx-sized request back unchanged.
 u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlUnmapBufferEx request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called");

    std::memcpy(output.data(), &request, sizeof(request));
    return 0;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@ -4,16 +4,14 @@
 #pragma once
-#include <vector>
+#include <memory>
-#include "common/common_types.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 namespace Service::Nvidia::Devices {
-class nvhost_nvdec final : public nvdevice {
+class nvhost_nvdec final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_nvdec(Core::System& system);
+    explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_nvdec() override;
    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -27,62 +25,15 @@ private:
        IocGetSyncpoint = 0xC0080002,
        IocGetWaitbase = 0xC0080003,
        IocMapBuffer = 0xC01C0009,
        IocMapBuffer2 = 0xC16C0009,
        IocMapBuffer3 = 0xC15C0009,
        IocMapBufferEx = 0xC0A40009,
-        IocUnmapBufferEx = 0xC0A4000A,
+        IocUnmapBuffer = 0xC0A4000A,
        IocUnmapBuffer2 = 0xC16C000A,
        IocUnmapBufferEx = 0xC01C000A,
        IocUnmapBuffer3 = 0xC15C000A,
        IocSetSubmitTimeout = 0x40040007,
    };
    struct IoctlSetNvmapFD {
        u32_le nvmap_fd;
    };
    static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size");
    struct IoctlSubmit {
        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size");
    struct IoctlGetSyncpoint {
        u32 unknown; // seems to be ignored? Nintendo added this
        u32 value;
    };
    static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size");
    struct IoctlGetWaitbase {
        u32 unknown; // seems to be ignored? Nintendo added this
        u32 value;
    };
    static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size");
    struct IoctlMapBuffer {
        u32 unknown;
        u32 address_1;
        u32 address_2;
        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
    struct IoctlMapBufferEx {
        u32 unknown;
        u32 address_1;
        u32 address_2;
        INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size");
    struct IoctlUnmapBufferEx {
        INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size");
    u32_le nvmap_fd{};
    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@ -0,0 +1,234 @@
 // Copyright 2020 yuzu emulator team
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <algorithm>
 #include <cstring>
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
 #include "core/hle/service/nvdrv/devices/nvmap.h"
 #include "core/memory.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_base.h"
 namespace Service::Nvidia::Devices {
 namespace {
 // Copies `count` elements of type T out of `input`, starting at byte `offset`, into `dst`.
 // Returns the byte offset immediately after the copied region. No bounds checking is done;
 // `dst` must already hold at least `count` elements.
 template <typename T>
 std::size_t SpliceVectors(const std::vector<u8>& input, std::vector<T>& dst, std::size_t count,
                          std::size_t offset) {
    const std::size_t byte_count = count * sizeof(T);
    std::memcpy(dst.data(), input.data() + offset, byte_count);
    return offset + byte_count;
 }
 // Copies every element of `src` into `dst`, beginning at byte `offset`.
 // Returns the byte offset immediately after the written region. No bounds checking is done.
 template <typename T>
 std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
    const std::size_t byte_count = src.size() * sizeof(T);
    std::memcpy(dst.data() + offset, src.data(), byte_count);
    return offset + byte_count;
 }
 } // Anonymous namespace
 // Status codes returned by the ioctl handlers in this file. Failures are negative numbers
 // stored in an unsigned 32-bit value.
 namespace NvErrCodes {
 constexpr u32 Success = 0;
 constexpr u32 OutOfMemory = static_cast<u32>(-12);
 constexpr u32 InvalidInput = static_cast<u32>(-22);
 } // namespace NvErrCodes
 // Forwards `system` to the nvdevice base and takes ownership of the shared nvmap device handle.
 nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvhost_nvdec_common::~nvhost_nvdec_common() = default;
 // Remembers the nvmap file descriptor carried in the ioctl input.
 u32 nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
    IoctlSetNvmapFD request{};
    std::memcpy(&request, input.data(), sizeof(request));
    LOG_DEBUG(Service_NVDRV, "called, fd={}", request.nvmap_fd);

    nvmap_fd = request.nvmap_fd;
    return 0;
 }
 // Handles the Submit ioctl.
 //
 // The input is an IoctlSubmit header followed, in this order, by: command buffers, relocations,
 // relocation shifts, syncpoint increments, wait checks and fences. Each command buffer is
 // resolved through its nvmap object and buffer mapping, read from GPU memory and pushed to the
 // GPU. The header and every array except the fences are then written back to `output`.
 //
 // Returns NvErrCodes::Success on completion, NvErrCodes::InvalidInput when a command buffer's
 // nvmap object cannot be resolved, and 0 when no buffer mapping is found for it.
 u32 nvhost_nvdec_common::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSubmit params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
    LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
    // Instantiate param buffers
    // Each vector is sized from the counts in the header so SpliceVectors can copy into it.
    std::size_t offset = sizeof(IoctlSubmit);
    std::vector<CommandBuffer> command_buffers(params.cmd_buffer_count);
    std::vector<Reloc> relocs(params.relocation_count);
    std::vector<u32> reloc_shifts(params.relocation_count);
    std::vector<SyncptIncr> syncpt_increments(params.syncpoint_count);
    std::vector<SyncptIncr> wait_checks(params.syncpoint_count);
    std::vector<Fence> fences(params.fence_count);
    // Splice input into their respective buffers
    // `offset` is threaded through the calls: each one starts where the previous one stopped.
    offset = SpliceVectors(input, command_buffers, params.cmd_buffer_count, offset);
    offset = SpliceVectors(input, relocs, params.relocation_count, offset);
    offset = SpliceVectors(input, reloc_shifts, params.relocation_count, offset);
    offset = SpliceVectors(input, syncpt_increments, params.syncpoint_count, offset);
    offset = SpliceVectors(input, wait_checks, params.syncpoint_count, offset);
    offset = SpliceVectors(input, fences, params.fence_count, offset);
    // TODO(ameerj): For async gpu, utilize fences for syncpoint 'max' increment
    auto& gpu = system.GPU();
    for (const auto& cmd_buffer : command_buffers) {
        auto object = nvmap_dev->GetObject(cmd_buffer.memory_id);
        ASSERT_OR_EXECUTE(object, return NvErrCodes::InvalidInput;);
        const auto map = FindBufferMap(object->dma_map_addr);
        if (!map) {
            LOG_ERROR(Service_NVDRV, "Tried to submit an invalid offset 0x{:X} dma 0x{:X}",
                      object->addr, object->dma_map_addr);
            // NOTE(review): returns 0 (same value as Success) and leaves `output` unwritten.
            return 0;
        }
        // word_count is a count of 32-bit words, hence the sizeof(u32) scaling below.
        Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
        gpu.MemoryManager().ReadBlock(map->StartAddr() + cmd_buffer.offset, cmdlist.data(),
                                      cmdlist.size() * sizeof(u32));
        gpu.PushCommandBuffer(cmdlist);
    }
    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
    // Some games expect command_buffers to be written back
    offset = sizeof(IoctlSubmit);
    offset = WriteVectors(output, command_buffers, offset);
    offset = WriteVectors(output, relocs, offset);
    offset = WriteVectors(output, reloc_shifts, offset);
    offset = WriteVectors(output, syncpt_increments, offset);
    // NOTE(review): `fences` is parsed above but not written back here — confirm this is intended.
    offset = WriteVectors(output, wait_checks, offset);
    return NvErrCodes::Success;
 }
 // Replies to a syncpoint query: the request is echoed back with its value field forced to 0.
 u32 nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetSyncpoint reply{};
    std::memcpy(&reply, input.data(), sizeof(reply));
    LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", reply.param);

    // We found that implementing this causes deadlocks with async gpu, along with degraded
    // performance. TODO: RE the nvdec async implementation
    reply.value = 0;

    std::memcpy(output.data(), &reply, sizeof(reply));
    return NvErrCodes::Success;
 }
 // Replies to a waitbase query: the request is echoed back with its value field forced to 0.
 u32 nvhost_nvdec_common::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetWaitbase reply{};
    std::memcpy(&reply, input.data(), sizeof(reply));

    // Seems to be hard coded at 0
    reply.value = 0;

    std::memcpy(output.data(), &reply, sizeof(reply));
    return 0;
 }
 // Handles the MapBuffer family of ioctls.
 //
 // The input is an IoctlMapBuffer header followed by `num_entries` MapBufferEntry records. For
 // every entry the nvmap object is looked up; an object without a DMA address is mapped via
 // MapAllocate32, and the resulting address is stored in the entry and registered in
 // buffer_mappings. The header and the updated entries are written to `output`.
 //
 // Returns NvErrCodes::Success, or NvErrCodes::InvalidInput as soon as a handle cannot be
 // resolved (in which case only the header is echoed back).
 u32 nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBuffer params{};
    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
    auto& gpu = system.GPU();
    for (auto& cmf_buff : cmd_buffer_handles) {
        auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
        if (!object) {
            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
            // Echo the header back. The length is capped at sizeof(params): `output` is usually
            // larger than the header and copying output.size() bytes would read past `params`.
            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
            return NvErrCodes::InvalidInput;
        }
        if (object->dma_map_addr == 0) {
            // NVDEC and VIC memory is in the 32-bit address space
            // MapAllocate32 will attempt to map a lower 32-bit value in the shared gpu memory space
            const GPUVAddr low_addr = gpu.MemoryManager().MapAllocate32(object->addr, object->size);
            object->dma_map_addr = static_cast<u32>(low_addr);
            // Ensure that the dma_map_addr is indeed in the lower 32-bit address space.
            ASSERT(object->dma_map_addr == low_addr);
        }
        if (!object->dma_map_addr) {
            // Still zero after the mapping attempt above: report it and leave the entry as-is.
            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
        } else {
            cmf_buff.map_address = object->dma_map_addr;
            AddBufferMap(object->dma_map_addr, object->size, object->addr,
                         object->status == nvmap::Object::Status::Allocated);
        }
    }
    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
    std::memcpy(output.data() + sizeof(IoctlMapBuffer), cmd_buffer_handles.data(),
                cmd_buffer_handles.size() * sizeof(MapBufferEntry));
    return NvErrCodes::Success;
 }
 // Handles the UnmapBuffer family of ioctls.
 //
 // The input has the same layout as for MapBuffer. For every entry the registered buffer mapping
 // is removed and the GPU range is unmapped, then the object's DMA address is reset to 0.
 // `output` is zero-filled on success.
 //
 // Returns NvErrCodes::Success, or NvErrCodes::InvalidInput as soon as a handle cannot be
 // resolved (in which case only the header is echoed back).
 u32 nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBuffer params{};
    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
    auto& gpu = system.GPU();
    for (auto& cmf_buff : cmd_buffer_handles) {
        const auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
        if (!object) {
            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
            // Echo the header back. The length is capped at sizeof(params): `output` is usually
            // larger than the header and copying output.size() bytes would read past `params`.
            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
            return NvErrCodes::InvalidInput;
        }
        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
        } else {
            // This occurs quite frequently, however does not seem to impact functionality
            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
                      object->dma_map_addr);
        }
        object->dma_map_addr = 0;
    }
    std::memset(output.data(), 0, output.size());
    return NvErrCodes::Success;
 }
 // Stores the submit timeout carried in the ioctl input. The value is recorded only (stubbed);
 // `output` is unused.
 u32 nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output) {
    // Never copy more bytes than the destination holds, whatever size the caller supplied.
    std::memcpy(&submit_timeout, input.data(), std::min(input.size(), sizeof(submit_timeout)));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
    return NvErrCodes::Success;
 }
 // Returns the registered buffer mapping whose [StartAddr, EndAddr) range contains `gpu_addr`,
 // or std::nullopt when no mapping covers that address.
 std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
    GPUVAddr gpu_addr) const {
    // Mappings are keyed by start address, so only keys <= gpu_addr can contain it.
    const auto search_end = buffer_mappings.upper_bound(gpu_addr);
    const auto it =
        std::find_if(buffer_mappings.begin(), search_end, [gpu_addr](const auto& entry) {
            return gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr();
        });
    // find_if signals "not found" with the END OF THE SEARCHED RANGE, which is not necessarily
    // buffer_mappings.end(); compare against that to avoid handing back an unrelated mapping.
    if (it == search_end) {
        return std::nullopt;
    }
    return it->second;
 }
 // Registers a mapping keyed by its GPU start address, replacing any mapping already stored
 // under the same key.
 void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
                                        bool is_allocated) {
    const BufferMap mapping{gpu_addr, size, cpu_addr, is_allocated};
    buffer_mappings.insert_or_assign(gpu_addr, mapping);
 }
 // Removes the mapping keyed exactly by `gpu_addr`.
 // Returns std::nullopt when no such mapping exists; otherwise the mapping's size if it was
 // flagged as allocated, or 0 if it was not.
 std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
    const auto found = buffer_mappings.find(gpu_addr);
    if (found == buffer_mappings.end()) {
        return std::nullopt;
    }
    const BufferMap& mapping = found->second;
    const std::size_t removed_size = mapping.IsAllocated() ? mapping.Size() : 0;
    buffer_mappings.erase(found);
    return removed_size;
 }
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@ -0,0 +1,168 @@
 // Copyright 2020 yuzu emulator team
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <map>
 #include <memory>
 #include <optional>
 #include <vector>
 #include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 namespace Service::Nvidia::Devices {
 class nvmap;
 class nvhost_nvdec_common : public nvdevice {
 public:
    explicit nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_nvdec_common() override;
    virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
                      std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
                      IoctlVersion version) = 0;
 protected:
    class BufferMap final {
    public:
        constexpr BufferMap() = default;
        constexpr BufferMap(GPUVAddr start_addr, std::size_t size)
            : start_addr{start_addr}, end_addr{start_addr + size} {}
        constexpr BufferMap(GPUVAddr start_addr, std::size_t size, VAddr cpu_addr,
                            bool is_allocated)
            : start_addr{start_addr}, end_addr{start_addr + size}, cpu_addr{cpu_addr},
              is_allocated{is_allocated} {}
        constexpr VAddr StartAddr() const {
            return start_addr;
        }
        constexpr VAddr EndAddr() const {
            return end_addr;
        }
        constexpr std::size_t Size() const {
            return end_addr - start_addr;
        }
        constexpr VAddr CpuAddr() const {
            return cpu_addr;
        }
        constexpr bool IsAllocated() const {
            return is_allocated;
        }
    private:
        GPUVAddr start_addr{};
        GPUVAddr end_addr{};
        VAddr cpu_addr{};
        bool is_allocated{};
    };
    struct IoctlSetNvmapFD {
        u32_le nvmap_fd;
    };
    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
    struct IoctlSubmitCommandBuffer {
        u32_le id;
        u32_le offset;
        u32_le count;
    };
    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
                  "IoctlSubmitCommandBuffer is incorrect size");
    struct IoctlSubmit {
        u32_le cmd_buffer_count;
        u32_le relocation_count;
        u32_le syncpoint_count;
        u32_le fence_count;
    };
    static_assert(sizeof(IoctlSubmit) == 0x10, "IoctlSubmit has incorrect size");
    struct CommandBuffer {
        s32 memory_id;
        u32 offset;
        s32 word_count;
    };
    static_assert(sizeof(CommandBuffer) == 0xC, "CommandBuffer has incorrect size");
    struct Reloc {
        s32 cmdbuffer_memory;
        s32 cmdbuffer_offset;
        s32 target;
        s32 target_offset;
    };
    static_assert(sizeof(Reloc) == 0x10, "CommandBuffer has incorrect size");
    struct SyncptIncr {
        u32 id;
        u32 increments;
    };
    static_assert(sizeof(SyncptIncr) == 0x8, "CommandBuffer has incorrect size");
    struct Fence {
        u32 id;
        u32 value;
    };
    static_assert(sizeof(Fence) == 0x8, "CommandBuffer has incorrect size");
    struct IoctlGetSyncpoint {
        // Input
        u32_le param;
        // Output
        u32_le value;
    };
    static_assert(sizeof(IoctlGetSyncpoint) == 8, "IocGetIdParams has wrong size");
    struct IoctlGetWaitbase {
        u32_le unknown; // seems to be ignored? Nintendo added this
        u32_le value;
    };
    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
    struct IoctlMapBuffer {
        u32_le num_entries;
        u32_le data_address; // Ignored by the driver.
        u32_le attach_host_ch_das;
    };
    static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
    struct IocGetIdParams {
        // Input
        u32_le param;
        // Output
        u32_le value;
    };
    static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
    // Used for mapping and unmapping command buffers
    struct MapBufferEntry {
        u32_le map_handle;
        u32_le map_address;
    };
    static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
    /// Ioctl command implementations
    u32 SetNVMAPfd(const std::vector<u8>& input);
    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    u32 UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    u32 SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
    u32_le nvmap_fd{};
    u32_le submit_timeout{};
    std::shared_ptr<nvmap> nvmap_dev;
    // This is expected to be ordered, therefore we must use a map, not unordered_map
    std::map<GPUVAddr, BufferMap> buffer_mappings;
 };
 }; // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <cstring>
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_vic.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_base.h"
 namespace Service::Nvidia::Devices {
 nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
 nvhost_vic::~nvhost_vic() = default;
 u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
    case IoctlCommand::IocSubmit:
        return Submit(input, output);
    case IoctlCommand::IocGetSyncpoint:
@ -29,83 +31,19 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
    case IoctlCommand::IocGetWaitbase:
        return GetWaitbase(input, output);
    case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
    case IoctlCommand::IocMapBuffer3:
    case IoctlCommand::IocMapBuffer4:
    case IoctlCommand::IocMapBufferEx:
        return MapBuffer(input, output);
    case IoctlCommand::IocUnmapBuffer:
    case IoctlCommand::IocUnmapBuffer2:
    case IoctlCommand::IocUnmapBuffer3:
    case IoctlCommand::IocUnmapBufferEx:
-        return UnmapBufferEx(input, output);
+        return UnmapBuffer(input, output);
    }
-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
    return 0;
 }
 u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSetNvmapFD params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
    nvmap_fd = params.nvmap_fd;
    return 0;
 }
 u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSubmit params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
    // Workaround for Luigi's Mansion 3, as nvhost_vic is not implemented for asynch GPU
    params.command_buffer = {};
    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
    return 0;
 }
 u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetSyncpoint params{};
    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
    params.value = 0; // Seems to be hard coded at 0
    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
    return 0;
 }
 u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlGetWaitbase params{};
    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
    params.value = 0; // Seems to be hard coded at 0
    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
    return 0;
 }
 u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBuffer params{};
    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
                params.address_1);
    params.address_1 = 0;
    params.address_2 = 0;
    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
    return 0;
 }
 u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlMapBufferEx params{};
    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
                params.address_1);
    params.address_1 = 0;
    params.address_2 = 0;
    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
    return 0;
 }
 u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlUnmapBufferEx params{};
    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
    return 0;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@ -4,19 +4,15 @@
 #pragma once
-#include <array>
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
 #include <vector>
 #include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 namespace Service::Nvidia::Devices {
 class nvmap;
-class nvhost_vic final : public nvdevice {
+class nvhost_vic final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_vic(Core::System& system);
+    explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
-    ~nvhost_vic() override;
+    ~nvhost_vic();
    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
              std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
              IoctlVersion version) override;
@ -28,74 +24,14 @@ private:
        IocGetSyncpoint = 0xC0080002,
        IocGetWaitbase = 0xC0080003,
        IocMapBuffer = 0xC01C0009,
        IocMapBuffer2 = 0xC0340009,
        IocMapBuffer3 = 0xC0140009,
        IocMapBuffer4 = 0xC00C0009,
        IocMapBufferEx = 0xC03C0009,
-        IocUnmapBufferEx = 0xC03C000A,
+        IocUnmapBuffer = 0xC03C000A,
        IocUnmapBuffer2 = 0xC034000A,
        IocUnmapBuffer3 = 0xC00C000A,
        IocUnmapBufferEx = 0xC01C000A,
    };
    struct IoctlSetNvmapFD {
        u32_le nvmap_fd;
    };
    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
    struct IoctlSubmitCommandBuffer {
        u32 id;
        u32 offset;
        u32 count;
    };
    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
                  "IoctlSubmitCommandBuffer is incorrect size");
    struct IoctlSubmit {
        u32 command_buffer_count;
        u32 relocations_count;
        u32 syncpt_count;
        u32 wait_count;
        std::array<IoctlSubmitCommandBuffer, 4> command_buffer;
    };
    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size");
    struct IoctlGetSyncpoint {
        u32 unknown; // seems to be ignored? Nintendo added this
        u32 value;
    };
    static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size");
    struct IoctlGetWaitbase {
        u32 unknown; // seems to be ignored? Nintendo added this
        u32 value;
    };
    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
    struct IoctlMapBuffer {
        u32 unknown;
        u32 address_1;
        u32 address_2;
        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
    struct IoctlMapBufferEx {
        u32 unknown;
        u32 address_1;
        u32 address_2;
        INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size");
    struct IoctlUnmapBufferEx {
        INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure
    };
    static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size");
    u32_le nvmap_fd{};
    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@ -37,6 +37,7 @@ public:
        VAddr addr;
        Status status;
        u32 refcount;
        u32 dma_map_addr;
    };
    std::shared_ptr<Object> GetObject(u32 handle) const {
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@ -51,9 +51,9 @@ Module::Module(Core::System& system) {
    devices["/dev/nvmap"] = nvmap_dev;
    devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
    devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
-    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system);
+    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);
    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
-    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system);
+    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);
 }
 Module::~Module() = default;
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@ -63,6 +63,7 @@ void LogSettings() {
    log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue());
    log_setting("Renderer_UseAsynchronousGpuEmulation",
                values.use_asynchronous_gpu_emulation.GetValue());
    log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());
    log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
    log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());
    log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
@ -119,6 +120,7 @@ void RestoreGlobalState() {
    values.use_disk_shader_cache.SetGlobal(true);
    values.gpu_accuracy.SetGlobal(true);
    values.use_asynchronous_gpu_emulation.SetGlobal(true);
    values.use_nvdec_emulation.SetGlobal(true);
    values.use_vsync.SetGlobal(true);
    values.use_assembly_shaders.SetGlobal(true);
    values.use_asynchronous_shaders.SetGlobal(true);
--- a/src/core/settings.h
+++ b/src/core/settings.h
@ -111,6 +111,7 @@ struct Values {
    Setting<bool> use_disk_shader_cache;
    Setting<GPUAccuracy> gpu_accuracy;
    Setting<bool> use_asynchronous_gpu_emulation;
    Setting<bool> use_nvdec_emulation;
    Setting<bool> use_vsync;
    Setting<bool> use_assembly_shaders;
    Setting<bool> use_asynchronous_shaders;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@ -206,6 +206,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
             TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));
    AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation.GetValue());
    AddField(field_type, "Renderer_UseNvdecEmulation",
             Settings::values.use_nvdec_emulation.GetValue());
    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
    AddField(field_type, "Renderer_UseAssemblyShaders",
             Settings::values.use_assembly_shaders.GetValue());
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -5,6 +5,24 @@ add_library(video_core STATIC
    buffer_cache/buffer_cache.h
    buffer_cache/map_interval.cpp
    buffer_cache/map_interval.h
    cdma_pusher.cpp
    cdma_pusher.h
    command_classes/codecs/codec.cpp
    command_classes/codecs/codec.h
    command_classes/codecs/h264.cpp
    command_classes/codecs/h264.h
    command_classes/codecs/vp9.cpp
    command_classes/codecs/vp9.h
    command_classes/codecs/vp9_types.h
    command_classes/host1x.cpp
    command_classes/host1x.h
    command_classes/nvdec.cpp
    command_classes/nvdec.h
    command_classes/nvdec_common.h
    command_classes/sync_manager.cpp
    command_classes/sync_manager.h
    command_classes/vic.cpp
    command_classes/vic.h
    compatible_formats.cpp
    compatible_formats.h
    dirty_flags.cpp
@ -250,6 +268,14 @@ create_target_directory_groups(video_core)
 target_link_libraries(video_core PUBLIC common core)
 target_link_libraries(video_core PRIVATE glad xbyak)
 # FFmpeg headers are required on every platform; only the link step differs per toolchain.
 target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
 if (MSVC)
    # MSVC builds link the FFmpeg import libraries directly from FFMPEG_LIBRARY_DIR.
    target_link_libraries(video_core PUBLIC
        ${FFMPEG_LIBRARY_DIR}/swscale.lib
        ${FFMPEG_LIBRARY_DIR}/avcodec.lib
        ${FFMPEG_LIBRARY_DIR}/avutil.lib
    )
 else()
    # Other toolchains link whatever library list FFMPEG_LIBRARIES holds.
    target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
 endif()
 add_dependencies(video_core host_shaders)
 target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@ -0,0 +1,171 @@
 // MIT License
 //
 // Copyright (c) Ryujinx Team and Contributors
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
 // associated documentation files (the "Software"), to deal in the Software without restriction,
 // including without limitation the rights to use, copy, modify, merge, publish, distribute,
 // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or
 // substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 // NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //
 #include "command_classes/host1x.h"
 #include "command_classes/nvdec.h"
 #include "command_classes/vic.h"
 #include "common/bit_util.h"
 #include "video_core/cdma_pusher.h"
 #include "video_core/command_classes/nvdec_common.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 namespace Tegra {
 /// Builds the pusher together with the command-class processors it dispatches to.
 /// The Nvdec processor is held through a shared_ptr because the Vic processor is constructed
 /// with a handle to it; each class also gets its own SyncptIncrManager.
 CDmaPusher::CDmaPusher(GPU& gpu_)
    : gpu(gpu_), nvdec_processor(std::make_shared<Nvdec>(gpu_)),
      vic_processor(std::make_unique<Vic>(gpu_, nvdec_processor)),
      host1x_processor(std::make_unique<Host1x>(gpu_)),
      nvdec_sync(std::make_unique<SyncptIncrManager>(gpu_)),
      vic_sync(std::make_unique<SyncptIncrManager>(gpu_)) {}
 CDmaPusher::~CDmaPusher() = default;
 /// Appends a command list to the back of the pending queue, taking ownership of its storage.
 void CDmaPusher::Push(ChCommandHeaderList&& entries) {
    cdma_queue.emplace(std::move(entries));
 }
 /// Processes queued command lists until the queue is empty. Each Step() call pops exactly one
 /// list, so the loop terminates once everything pushed so far has been consumed.
 void CDmaPusher::DispatchCalls() {
    for (;;) {
        if (cdma_queue.empty()) {
            return;
        }
        Step();
    }
 }
 void CDmaPusher::Step() {
    const auto entries{cdma_queue.front()};
    cdma_queue.pop();
    std::vector<u32> values(entries.size());
    std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
    for (const u32 value : values) {
        if (mask != 0) {
            const u32 lbs = Common::CountTrailingZeroes32(mask);
            mask &= ~(1U << lbs);
            ExecuteCommand(static_cast<u32>(offset + lbs), value);
            continue;
        } else if (count != 0) {
            --count;
            ExecuteCommand(static_cast<u32>(offset), value);
            if (incrementing) {
                ++offset;
            }
            continue;
        }
        const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
        switch (mode) {
        case ChSubmissionMode::SetClass: {
            mask = value & 0x3f;
            offset = (value >> 16) & 0xfff;
            current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
            break;
        }
        case ChSubmissionMode::Incrementing:
        case ChSubmissionMode::NonIncrementing:
            count = value & 0xffff;
            offset = (value >> 16) & 0xfff;
            incrementing = mode == ChSubmissionMode::Incrementing;
            break;
        case ChSubmissionMode::Mask:
            mask = value & 0xffff;
            offset = (value >> 16) & 0xfff;
            break;
        case ChSubmissionMode::Immediate: {
            const u32 data = value & 0xfff;
            offset = (value >> 16) & 0xfff;
            ExecuteCommand(static_cast<u32>(offset), data);
            break;
        }
        default:
            UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
            break;
        }
    }
 }
 /// Routes one decoded (offset, data) pair to the processor of the currently selected class.
 ///
 /// For the NvDec and GraphicsVic classes the word is first stored into that class's
 /// ThiRegisters block, then the THI method is acted upon:
 ///   - IncSyncpt: bits 0..7 of data are the syncpoint id, bits 8..15 the condition; condition 0
 ///     increments immediately, anything else is registered and then signalled as done.
 ///   - SetMethod1: forwards `data` to the processor using the method id previously latched in
 ///     method_0.
 /// Host1x words are forwarded directly. Any other class is reported as unimplemented.
 ///
 /// @param offset Word index of the method being written (this parameter shadows the `offset`
 ///               member used by Step()).
 /// @param data   Argument word for the method.
 void CDmaPusher::ExecuteCommand(u32 offset, u32 data) {
    switch (current_class) {
    case ChClassId::NvDec:
        ThiStateWrite(nvdec_thi_state, offset, {data});
        switch (static_cast<ThiMethod>(offset)) {
        case ThiMethod::IncSyncpt: {
            LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
            if (cond == 0) {
                nvdec_sync->Increment(syncpoint_id);
            } else {
                nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
                nvdec_sync->SignalDone(syncpoint_id);
            }
            break;
        }
        case ThiMethod::SetMethod1:
            LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
                      static_cast<u32>(nvdec_thi_state.method_0));
            nvdec_processor->ProcessMethod(
                static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data});
            break;
        default:
            break;
        }
        break;
    case ChClassId::GraphicsVic:
        ThiStateWrite(vic_thi_state, offset, {data});
        switch (static_cast<ThiMethod>(offset)) {
        case ThiMethod::IncSyncpt: {
            LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
            if (cond == 0) {
                vic_sync->Increment(syncpoint_id);
            } else {
                vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
                vic_sync->SignalDone(syncpoint_id);
            }
            break;
        }
        case ThiMethod::SetMethod1:
            // The format string has two placeholders; the second one (Args) was previously left
            // without an argument, which is a format error. Supply the argument word.
            LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
                      static_cast<u32>(vic_thi_state.method_0), data);
            vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0),
                                         {data});
            break;
        default:
            break;
        }
        break;
    case ChClassId::Host1x:
        // This device is mainly for syncpoint synchronization
        LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
        host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data});
        break;
    default:
        UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
        break;
    }
 }
 /// Copies `arguments` into `state`, starting at the 32-bit word index `offset`.
 ///
 /// The offset originates from the command stream (a 12-bit field, so up to 0xfff) while
 /// ThiRegisters only spans a few dozen words. Writes that would fall outside the register
 /// block are therefore dropped instead of overwriting whatever follows it in memory.
 ///
 /// @param state     Register block to modify.
 /// @param offset    Destination index, in 32-bit words from the start of `state`.
 /// @param arguments Words to store consecutively from `offset`.
 void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) {
    const std::size_t byte_offset = sizeof(u32) * static_cast<std::size_t>(offset);
    const std::size_t byte_count = sizeof(u32) * arguments.size();
    // Written as two comparisons so that the subtraction can never underflow.
    if (byte_offset > sizeof(ThiRegisters) || byte_count > sizeof(ThiRegisters) - byte_offset) {
        LOG_WARNING(Service_NVDRV, "Ignoring out-of-range THI register write, offset=0x{:X}",
                    offset);
        return;
    }
    u8* const state_offset = reinterpret_cast<u8*>(&state) + byte_offset;
    std::memcpy(state_offset, arguments.data(), byte_count);
 }
 } // namespace Tegra
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@ -0,0 +1,138 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <memory>
 #include <unordered_map>
 #include <vector>
 #include <queue>
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "video_core/command_classes/sync_manager.h"
 namespace Tegra {
 class GPU;
 class Nvdec;
 class Vic;
 class Host1x;
 /// Submission mode of a channel command header word. CDmaPusher::Step() reads it from bits
 /// 28..31 of the header and handles SetClass, Incrementing, NonIncrementing, Mask and Immediate;
 /// the remaining modes reach its "not implemented" default branch.
 enum class ChSubmissionMode : u32 {
    SetClass = 0,
    Incrementing = 1,
    NonIncrementing = 2,
    Mask = 3,
    Immediate = 4,
    Restart = 5,
    Gather = 6,
 };
 /// Class identifier selected by a SetClass header (bits 6..15 of the header word, as decoded
 /// in CDmaPusher::Step()). CDmaPusher::ExecuteCommand() dispatches only Host1x, GraphicsVic and
 /// NvDec; every other class is reported as unimplemented there.
 enum class ChClassId : u32 {
    NoClass = 0x0,
    Host1x = 0x1,
    VideoEncodeMpeg = 0x20,
    VideoEncodeNvEnc = 0x21,
    VideoStreamingVi = 0x30,
    VideoStreamingIsp = 0x32,
    VideoStreamingIspB = 0x34,
    VideoStreamingViI2c = 0x36,
    GraphicsVic = 0x5d,
    Graphics3D = 0x60,
    GraphicsGpu = 0x61,
    Tsec = 0xe0,
    TsecB = 0xe1,
    NvJpg = 0xc0,
    NvDec = 0xf0
 };
 /// Method offsets carried in the method_offset field of ChCommandHeader.
 // NOTE(review): 0x10 and 0x11 coincide with the word indices of ThiRegisters::method_0 and
 // ThiRegisters::method_1 — presumably these name the same registers; confirm.
 enum class ChMethod : u32 {
    Empty = 0,
    SetMethod = 0x10,
    SetData = 0x11,
 };
 /// One 32-bit word of a channel command list, viewable either as the raw value or as the
 /// bit fields of a command header.
 union ChCommandHeader {
    u32 raw;
    // Bits 0..15: payload of the header.
    BitField<0, 16, u32> value;
    // Bits 16..27: method offset.
    BitField<16, 12, ChMethod> method_offset;
    // Bits 28..31: submission mode.
    BitField<28, 4, ChSubmissionMode> submission_mode;
 };
 static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
 /// A command expressed as a target class, a method offset and its argument words.
 struct ChCommand {
    ChClassId class_id{};
    int method_offset{};
    std::vector<u32> arguments;
 };
 using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>;
 using ChCommandList = std::vector<Tegra::ChCommand>;
 /// Register block addressed by 32-bit word index. CDmaPusher::ThiStateWrite() stores incoming
 /// words at `word index * sizeof(u32)` from the start of this struct, so the order of the
 /// fields and the padding between them define the register offsets and must not be changed.
 /// Field spellings (including "incremement") are kept as-is because they are part of the
 /// public layout.
 struct ThiRegisters {
    u32_le increment_syncpt{};
    INSERT_PADDING_WORDS(1);
    u32_le increment_syncpt_error{};
    u32_le ctx_switch_incremement_syncpt{};
    INSERT_PADDING_WORDS(4);
    u32_le ctx_switch{};
    INSERT_PADDING_WORDS(1);
    u32_le ctx_syncpt_eof{};
    INSERT_PADDING_WORDS(5);
    // Latched method id; read back when method_1 is written (see CDmaPusher::ExecuteCommand).
    u32_le method_0{};
    // Writing this word triggers processing of the method latched in method_0.
    u32_le method_1{};
    INSERT_PADDING_WORDS(12);
    u32_le int_status{};
    u32_le int_mask{};
 };
 /// Word indices of the ThiRegisters fields that CDmaPusher::ExecuteCommand() reacts to.
 /// Each value is derived from the field's byte offset, so it follows the struct layout.
 enum class ThiMethod : u32 {
    IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
    SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
    SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
 };
 /// Queues channel command lists and interprets them, forwarding the decoded methods to the
 /// Nvdec, Vic and Host1x command-class processors.
 class CDmaPusher {
 public:
    explicit CDmaPusher(GPU& gpu);
    ~CDmaPusher();
    /// Push NVDEC command buffer entries into queue
    void Push(ChCommandHeaderList&& entries);
    /// Process queued command buffer entries
    void DispatchCalls();
    /// Process one queue element
    void Step();
    /// Invoke command class devices to execute the command based on the current state
    void ExecuteCommand(u32 offset, u32 data);
 private:
    /// Write arguments value to the ThiRegisters member at the specified offset
    void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments);
    GPU& gpu;
    // Shared because the Vic processor is constructed with a handle to the Nvdec processor.
    std::shared_ptr<Tegra::Nvdec> nvdec_processor;
    std::unique_ptr<Tegra::Vic> vic_processor;
    std::unique_ptr<Tegra::Host1x> host1x_processor;
    // Separate syncpoint increment managers for the NvDec and VIC classes.
    std::unique_ptr<SyncptIncrManager> nvdec_sync;
    std::unique_ptr<SyncptIncrManager> vic_sync;
    // Class selected by the most recent SetClass header.
    ChClassId current_class{};
    ThiRegisters vic_thi_state{};
    ThiRegisters nvdec_thi_state{};
    // Decoder state carried between words (and between Step() calls):
    // remaining data words of an Incrementing/NonIncrementing command,
    s32 count{};
    // current method offset,
    s32 offset{};
    // pending bits of a Mask/SetClass command,
    s32 mask{};
    // and whether `offset` advances after each data word.
    bool incrementing{};
    // Queue of command lists to be processed
    std::queue<ChCommandHeaderList> cdma_queue;
 };
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@ -0,0 +1,114 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <cstring>
 #include <fstream>
 #include "common/assert.h"
 #include "video_core/command_classes/codecs/codec.h"
 #include "video_core/command_classes/codecs/h264.h"
 #include "video_core/command_classes/codecs/vp9.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 extern "C" {
 #include <libavutil/opt.h>
 }
 namespace Tegra {
 /// Stores the GPU reference and creates the H264 and VP9 header composers. No libav state is
 /// created here; that is deferred to the first Decode() call.
 Codec::Codec(GPU& gpu_)
    : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
 /// Releases the libav decoder state, if Decode() ever managed to create it.
 Codec::~Codec() {
    if (!initialized) {
        // Decode() never opened a decoder, so there is nothing to release.
        return;
    }
    // Put the decoder into draining mode and discard whatever is still buffered.
    avcodec_send_packet(av_codec_ctx, nullptr);
    avcodec_receive_frame(av_codec_ctx, av_frame);
    avcodec_flush_buffers(av_codec_ctx);
    // Use the matching libav deallocators: av_frame_free() unreferences and frees the frame
    // (av_free() is not the documented counterpart of av_frame_alloc()), and
    // avcodec_free_context() closes the codec *and* frees the context, whereas avcodec_close()
    // alone left the AVCodecContext allocation leaked.
    av_frame_free(&av_frame);
    avcodec_free_context(&av_codec_ctx);
 }
 /// Records which video codec subsequent Decode() calls should use and logs the selection.
 void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
    current_codec = codec;
    const auto codec_id = static_cast<u32>(codec);
    LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec_id);
 }
 /// Stores one 64-bit value into the NVDEC register state.
 ///
 /// @param offset    Index of the destination slot, in 64-bit units from the start of `state`.
 /// @param arguments Value to store.
 ///
 /// Writes whose destination would not lie entirely inside `state` are rejected, since the
 /// index is not validated anywhere before reaching this point in the visible code.
 void Codec::StateWrite(u32 offset, u64 arguments) {
    const std::size_t byte_offset = static_cast<std::size_t>(offset) * sizeof(u64);
    if (byte_offset + sizeof(u64) > sizeof(state)) {
        LOG_ERROR(Service_NVDRV, "Ignoring out-of-range NVDEC state write, offset=0x{:X}", offset);
        return;
    }
    u8* const state_offset = reinterpret_cast<u8*>(&state) + byte_offset;
    std::memcpy(state_offset, &arguments, sizeof(u64));
 }
 void Codec::Decode() {
    bool is_first_frame = false;
    if (!initialized) {
        if (current_codec == NvdecCommon::VideoCodec::H264) {
            av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
        } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
            av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
        } else {
            LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec));
            return;
        }
        av_codec_ctx = avcodec_alloc_context3(av_codec);
        av_frame = av_frame_alloc();
        av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
        // TODO(ameerj): libavcodec gpu hw acceleration
        const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
        if (av_error < 0) {
            LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
            av_frame_unref(av_frame);
            av_free(av_frame);
            avcodec_close(av_codec_ctx);
            return;
        }
        initialized = true;
        is_first_frame = true;
    }
    bool vp9_hidden_frame = false;
    AVPacket packet{};
    av_init_packet(&packet);
    std::vector<u8> frame_data;
    if (current_codec == NvdecCommon::VideoCodec::H264) {
        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
        frame_data = vp9_decoder->ComposeFrameHeader(state);
        vp9_hidden_frame = vp9_decoder->WasFrameHidden();
    }
    packet.data = frame_data.data();
    packet.size = static_cast<int>(frame_data.size());
    avcodec_send_packet(av_codec_ctx, &packet);
    if (!vp9_hidden_frame) {
        // Only receive/store visible frames
        avcodec_receive_frame(av_codec_ctx, av_frame);
    }
 }
 /// Returns the frame object that Decode() receives into; null until Decode() has allocated it.
 AVFrame* Codec::GetCurrentFrame() {
    return av_frame;
 }
 /// Read-only overload of GetCurrentFrame(); returns the same pointer.
 const AVFrame* Codec::GetCurrentFrame() const {
    return av_frame;
 }
 /// Returns the codec most recently selected through SetTargetCodec().
 NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
    return current_codec;
 }
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/codec.h
+++ b/src/video_core/command_classes/codecs/codec.h
@ -0,0 +1,68 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <memory>
 #include <vector>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/command_classes/nvdec_common.h"
 extern "C" {
 // Silence -Wconversion for the FFmpeg header only. The diagnostic state must be pushed before
 // it is modified: the pop below otherwise has nothing to restore, and the suppression would
 // apply to everything that includes this header.
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wconversion"
 #endif
 #include <libavcodec/avcodec.h>
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
 }
 namespace Tegra {
 class GPU;
 struct VicRegisters;
 namespace Decoder {
 class H264;
 class VP9;
 } // namespace Decoder
 /// Wraps a libavcodec decoder for NVDEC emulation: holds the NVDEC register state, asks the
 /// H264/VP9 helpers to compose the bitstream for each frame and keeps the last decoded frame.
 class Codec {
 public:
    explicit Codec(GPU& gpu);
    ~Codec();
    /// Sets NVDEC video stream codec
    void SetTargetCodec(NvdecCommon::VideoCodec codec);
    /// Populate NvdecRegisters state with argument value at the provided offset
    void StateWrite(u32 offset, u64 arguments);
    /// Call decoders to construct headers, decode AVFrame with ffmpeg
    void Decode();
    /// Returns most recently decoded frame
    AVFrame* GetCurrentFrame();
    const AVFrame* GetCurrentFrame() const;
    /// Returns the value of current_codec
    NvdecCommon::VideoCodec GetCurrentCodec() const;
 private:
    // Set once Decode() has successfully opened the libav decoder.
    bool initialized{};
    NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
    // libav objects, created lazily by the first Decode() call.
    AVCodec* av_codec{nullptr};
    AVCodecContext* av_codec_ctx{nullptr};
    AVFrame* av_frame{nullptr};
    GPU& gpu;
    // Per-codec helpers that compose the data handed to libavcodec.
    std::unique_ptr<Decoder::H264> h264_decoder;
    std::unique_ptr<Decoder::VP9> vp9_decoder;
    // Register state written through StateWrite() and read by the header composers.
    NvdecCommon::NvdecRegisters state{};
 };
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/h264.cpp
+++ b/src/video_core/command_classes/codecs/h264.cpp
@ -0,0 +1,276 @@
 // MIT License
 //
 // Copyright (c) Ryujinx Team and Contributors
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
 // associated documentation files (the "Software"), to deal in the Software without restriction,
 // including without limitation the rights to use, copy, modify, merge, publish, distribute,
 // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or
 // substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 // NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //
 #include "common/bit_util.h"
 #include "video_core/command_classes/codecs/h264.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 namespace Tegra::Decoder {
 /// Stores the GPU reference; it is used later to read guest memory through its memory manager.
 H264::H264(GPU& gpu_) : gpu(gpu_) {}
 /// Defaulted destructor.
 H264::~H264() = default;
 /// Builds the byte stream that is handed to FFmpeg for the current H.264 frame.
 ///
 /// The H264DecoderContext is read from state.picture_info_offset. When this is the first frame,
 /// or when the frame number extracted from the parameter-set flags is zero, two NAL units
 /// (nal_unit_type 7 and 8, i.e. SPS and PPS) are encoded from that context and placed in front
 /// of the raw bitstream read from state.frame_bitstream_offset. Otherwise only the raw bitstream
 /// is returned.
 ///
 /// @param state          NVDEC register state providing the GPU offsets to read from.
 /// @param is_first_frame Forces the SPS/PPS header to be emitted.
 /// @returns Reference to the internal frame buffer; it is overwritten by the next call.
 std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) {
    H264DecoderContext context{};
    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
    const auto& params = context.h264_parameter_set;
    const s32 frame_number = static_cast<s32>((params.flags >> 46) & 0x1ffff);
    if (!is_first_frame && frame_number != 0) {
        // No header required: return the raw bitstream as-is.
        frame.resize(context.frame_data_size);
        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
        return frame;
    }
    // NOTE(review): the field names in the comments below follow the reading order of the
    // H.264 SPS/PPS syntax tables; confirm against the specification when changing anything.
    H264BitWriter writer{};
    // H264 SPS
    writer.WriteU(1, 24);  // Start code 0x000001
    writer.WriteU(0, 1);   // forbidden_zero_bit
    writer.WriteU(3, 2);   // nal_ref_idc
    writer.WriteU(7, 5);   // nal_unit_type
    writer.WriteU(100, 8); // profile_idc
    writer.WriteU(0, 8);   // Constraint flags and reserved bits
    writer.WriteU(31, 8);  // level_idc
    writer.WriteUe(0);     // seq_parameter_set_id
    const s32 chroma_format_idc = (params.flags >> 12) & 0x3;
    writer.WriteUe(chroma_format_idc);
    if (chroma_format_idc == 3) {
        writer.WriteBit(false); // separate_colour_plane_flag
    }
    writer.WriteUe(0);      // bit_depth_luma_minus8
    writer.WriteUe(0);      // bit_depth_chroma_minus8
    writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
    writer.WriteBit(false); // Scaling matrix present flag
    const s32 order_cnt_type = static_cast<s32>((params.flags >> 14) & 3);
    writer.WriteUe(static_cast<s32>((params.flags >> 8) & 0xf)); // log2_max_frame_num_minus4
    writer.WriteUe(order_cnt_type);
    if (order_cnt_type == 0) {
        writer.WriteUe(params.log2_max_pic_order_cnt);
    } else if (order_cnt_type == 1) {
        writer.WriteBit(params.delta_pic_order_always_zero_flag != 0);
        writer.WriteSe(0); // offset_for_non_ref_pic
        writer.WriteSe(0); // offset_for_top_to_bottom_field
        writer.WriteUe(0); // num_ref_frames_in_pic_order_cnt_cycle
    }
    // Map-unit height is halved when the stream is not frame-MBs-only.
    const s32 pic_height =
        params.pic_height_in_map_units / (params.frame_mbs_only_flag ? 1 : 2);
    writer.WriteUe(16);     // max_num_ref_frames
    writer.WriteBit(false); // gaps_in_frame_num_value_allowed_flag
    writer.WriteUe(params.pic_width_in_mbs - 1);
    writer.WriteUe(pic_height - 1);
    writer.WriteBit(params.frame_mbs_only_flag != 0);
    if (!params.frame_mbs_only_flag) {
        writer.WriteBit(((params.flags >> 0) & 1) != 0); // mb_adaptive_frame_field_flag
    }
    writer.WriteBit(((params.flags >> 1) & 1) != 0); // direct_8x8_inference_flag
    writer.WriteBit(false);                          // Frame cropping flag
    writer.WriteBit(false);                          // VUI parameter present flag
    writer.End();
    // H264 PPS
    writer.WriteU(1, 24); // Start code 0x000001
    writer.WriteU(0, 1);  // forbidden_zero_bit
    writer.WriteU(3, 2);  // nal_ref_idc
    writer.WriteU(8, 5);  // nal_unit_type
    writer.WriteUe(0);    // pic_parameter_set_id
    writer.WriteUe(0);    // seq_parameter_set_id
    writer.WriteBit(params.entropy_coding_mode_flag != 0);
    writer.WriteBit(false); // bottom_field_pic_order_in_frame_present_flag
    writer.WriteUe(0);      // num_slice_groups_minus1
    writer.WriteUe(params.num_refidx_l0_default_active);
    writer.WriteUe(params.num_refidx_l1_default_active);
    writer.WriteBit(((params.flags >> 2) & 1) != 0);                   // weighted_pred_flag
    writer.WriteU(static_cast<s32>((params.flags >> 32) & 0x3), 2);    // weighted_bipred_idc
    // 6-bit field, sign-extended to 32 bits with a shift pair.
    s32 pic_init_qp = static_cast<s32>((params.flags >> 16) & 0x3f);
    pic_init_qp = (pic_init_qp << 26) >> 26;
    writer.WriteSe(pic_init_qp);
    writer.WriteSe(0); // pic_init_qs
    // 5-bit field, sign-extended to 32 bits with a shift pair.
    s32 chroma_qp_index_offset = static_cast<s32>((params.flags >> 22) & 0x1f);
    chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
    writer.WriteSe(chroma_qp_index_offset);
    writer.WriteBit(params.deblocking_filter_control_flag != 0);
    writer.WriteBit(((params.flags >> 3) & 1) != 0); // constrained_intra_pred_flag
    writer.WriteBit(params.redundant_pic_count_flag != 0);
    writer.WriteBit(params.transform_8x8_mode_flag != 0);
    writer.WriteBit(true); // pic_scaling_matrix_present_flag
    // The matrix copies are loop-invariant, so build each of them once rather than per list.
    const std::vector<u8> matrix_x4(context.scaling_matrix_4.begin(),
                                    context.scaling_matrix_4.end());
    for (s32 index = 0; index < 6; index++) {
        writer.WriteBit(true); // This scaling list is present
        writer.WriteScalingList(matrix_x4, index * 16, 16);
    }
    if (params.transform_8x8_mode_flag) {
        const std::vector<u8> matrix_x8(context.scaling_matrix_8.begin(),
                                        context.scaling_matrix_8.end());
        for (s32 index = 0; index < 2; index++) {
            writer.WriteBit(true); // This scaling list is present
            writer.WriteScalingList(matrix_x8, index * 64, 64);
        }
    }
    // 5-bit field, sign-extended to 32 bits with a shift pair.
    s32 chroma_qp_index_offset2 = static_cast<s32>((params.flags >> 27) & 0x1f);
    chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
    writer.WriteSe(chroma_qp_index_offset2);
    writer.End();
    // Final layout: [encoded SPS + PPS][raw frame bitstream]
    const auto& encoded_header = writer.GetByteArray();
    frame.resize(encoded_header.size() + context.frame_data_size);
    std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
    gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
                                  frame.data() + encoded_header.size(),
                                  context.frame_data_size);
    return frame;
 }
 /// Defaulted; all members use the in-class initializers from the header.
 H264BitWriter::H264BitWriter() = default;
 /// Defaulted destructor.
 H264BitWriter::~H264BitWriter() = default;
 /// Writes the low value_sz bits of value as a fixed-width field (forwards to WriteBits).
 void H264BitWriter::WriteU(s32 value, s32 value_sz) {
    WriteBits(value, value_sz);
 }
 /// Writes value as a signed Exp-Golomb code (forwards to WriteExpGolombCodedInt).
 void H264BitWriter::WriteSe(s32 value) {
    WriteExpGolombCodedInt(value);
 }
 /// Writes value as an unsigned Exp-Golomb code; the argument is reinterpreted as unsigned.
 void H264BitWriter::WriteUe(s32 value) {
    const u32 unsigned_value = static_cast<u32>(value);
    WriteExpGolombCodedUInt(unsigned_value);
 }
 /// Terminates the current unit: appends a single 1 bit, then flushes the partially filled byte
 /// (remaining low bits of the byte stay zero) into the byte array.
 void H264BitWriter::End() {
    WriteBit(true);
    Flush();
 }
 void H264BitWriter::WriteBit(bool state) {
    WriteBits(state ? 1 : 0, 1);
 }
 /// Writes count scaling-list entries taken from list (beginning at offset start) as signed
 /// Exp-Golomb deltas, visiting the entries in zig-zag order.
 ///
 /// The 4x4 table (zig_zag_scan) is used when count is 16; any other count uses the 8x8 table
 /// (zig_zag_direct), so count must not exceed 64. Each entry is coded as the difference from
 /// the previously written entry, with the predictor starting at 8.
 void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
    // Index the constant lookup table directly; no per-call temporary copy is needed.
    const u8* const scan = count == 16 ? zig_zag_scan.data() : zig_zag_direct.data();
    u8 last_scale = 8;
    for (s32 index = 0; index < count; index++) {
        const u8 value = list[start + scan[index]];
        // u8 operands promote to int, so the difference may be negative.
        const s32 delta_scale = static_cast<s32>(value - last_scale);
        WriteSe(delta_scale);
        last_scale = value;
    }
 }
 /// Returns a mutable reference to the bytes flushed so far.
 std::vector<u8>& H264BitWriter::GetByteArray() {
    return byte_array;
 }
 /// Returns a read-only reference to the bytes flushed so far.
 const std::vector<u8>& H264BitWriter::GetByteArray() const {
    return byte_array;
 }
 /// Appends the low bit_count bits of value to the stream, most significant bit first.
 /// The bits are packed into the one-byte staging buffer in chunks; a full buffer is flushed
 /// by GetFreeBufferBits() before the next chunk is written.
 void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
    s32 written = 0;
    while (written < bit_count) {
        const s32 pending = bit_count - written;
        const s32 available = GetFreeBufferBits();
        // Never write more than what still fits into the staging byte.
        const s32 chunk = pending > available ? available : pending;
        const s32 chunk_mask = (1 << chunk) - 1;
        const s32 src_shift = pending - chunk;
        const s32 dst_shift = (buffer_size - buffer_pos) - chunk;
        buffer |= ((value >> src_shift) & chunk_mask) << dst_shift;
        written += chunk;
        buffer_pos += chunk;
    }
 }
 /// Maps a signed value onto an unsigned code number (2 * |value|, minus one for positive
 /// values) and writes it as an unsigned Exp-Golomb code.
 void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
    const s32 positive_adjust = value > 0 ? 1 : 0;
    const s32 magnitude = value < 0 ? -value : value;
    WriteExpGolombCodedUInt((magnitude << 1) - positive_adjust);
 }
 /// Writes value as an unsigned Exp-Golomb code: a prefix of (length - 1) zero bits followed by
 /// a 1 bit, then the (length - 1) low bits of value + 1.
 void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
    const u32 code = value + 1;
    const s32 bit_length = 32 - Common::CountLeadingZeroes32(static_cast<s32>(code));
    // Writing the value 1 in bit_length bits yields the zero prefix plus the marker bit.
    WriteBits(1, bit_length);
    const u32 suffix = code - (1U << (bit_length - 1));
    WriteBits(static_cast<s32>(suffix), bit_length - 1);
 }
 /// Returns how many bits are still free in the staging buffer, flushing it first when it is
 /// completely full.
 s32 H264BitWriter::GetFreeBufferBits() {
    const bool buffer_full = buffer_pos == buffer_size;
    if (buffer_full) {
        Flush();
    }
    return buffer_size - buffer_pos;
 }
 /// Appends the staging buffer to the byte array as one byte and resets it.
 /// Does nothing when no bits have been written since the last flush.
 void H264BitWriter::Flush() {
    if (buffer_pos != 0) {
        byte_array.push_back(static_cast<u8>(buffer));
        buffer = 0;
        buffer_pos = 0;
    }
 }
 } // namespace Tegra::Decoder
--- a/src/video_core/command_classes/codecs/h264.h
+++ b/src/video_core/command_classes/codecs/h264.h
@ -0,0 +1,130 @@
 // MIT License
 //
 // Copyright (c) Ryujinx Team and Contributors
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
 // associated documentation files (the "Software"), to deal in the Software without restriction,
 // including without limitation the rights to use, copy, modify, merge, publish, distribute,
 // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or
 // substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 // NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //
 #pragma once
 #include <vector>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/command_classes/nvdec_common.h"
 namespace Tegra {
 class GPU;
 namespace Decoder {
 /// Bit-level writer used to compose H.264 header NAL units. Bits are collected MSB-first in a
 /// one-byte staging buffer and appended to byte_array whenever the buffer is flushed.
 class H264BitWriter {
 public:
    H264BitWriter();
    ~H264BitWriter();
    /// The following Write methods are based on clause 9.1 in the H.264 specification.
    /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
    void WriteU(s32 value, s32 value_sz);
    void WriteSe(s32 value);
    void WriteUe(s32 value);
    /// Finalize the bitstream
    void End();
    /// Append a bit to the stream, equivalent value to the state parameter
    void WriteBit(bool state);
    /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
    /// Writes the scaling matrices of the stream
    void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
    /// Return the bitstream as a vector.
    std::vector<u8>& GetByteArray();
    const std::vector<u8>& GetByteArray() const;
 private:
    // ZigZag LUTs from libavcodec.
    // 8x8 scan order (64 entries)
    static constexpr std::array<u8, 64> zig_zag_direct{
        0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,  12, 19, 26, 33, 40, 48,
        41, 34, 27, 20, 13, 6,  7,  14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
        30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
    };
    // 4x4 scan order (16 entries), written as x + y * 4
    static constexpr std::array<u8, 16> zig_zag_scan{
        0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
        1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
    };
    /// Appends the low bit_count bits of value to the stream
    void WriteBits(s32 value, s32 bit_count);
    /// Signed / unsigned Exp-Golomb encoders behind WriteSe / WriteUe
    void WriteExpGolombCodedInt(s32 value);
    void WriteExpGolombCodedUInt(u32 value);
    /// Returns the free bits left in the staging buffer, flushing it first when it is full
    s32 GetFreeBufferBits();
    /// Moves the staging buffer into byte_array and resets it
    void Flush();
    // Capacity of the staging buffer in bits
    s32 buffer_size{8};
    // Staging buffer holding the bits not yet flushed
    s32 buffer{};
    // Number of bits currently used in the staging buffer
    s32 buffer_pos{};
    // Bytes flushed so far
    std::vector<u8> byte_array;
 };
 /// Composes the H.264 data passed to FFmpeg from the picture information read out of GPU memory.
 class H264 {
 public:
    explicit H264(GPU& gpu);
    ~H264();
    /// Compose the H264 header of the frame for FFmpeg decoding
    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
                                        bool is_first_frame = false);
 private:
    /// Parameter-set portion of the decoder context. The layout is fixed (0x68 bytes, checked
    /// below), so members must not be reordered or resized.
    struct H264ParameterSet {
        u32 log2_max_pic_order_cnt{};
        u32 delta_pic_order_always_zero_flag{};
        u32 frame_mbs_only_flag{};
        u32 pic_width_in_mbs{};
        u32 pic_height_in_map_units{};
        INSERT_PADDING_WORDS(1);
        u32 entropy_coding_mode_flag{};
        u32 bottom_field_pic_order_flag{};
        u32 num_refidx_l0_default_active{};
        u32 num_refidx_l1_default_active{};
        u32 deblocking_filter_control_flag{};
        u32 redundant_pic_count_flag{};
        u32 transform_8x8_mode_flag{};
        INSERT_PADDING_WORDS(9);
        // Packed bit fields; ComposeFrameHeader extracts individual values with shifts and masks
        u64 flags{};
        u32 frame_number{};
        u32 frame_number2{};
    };
    static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
    /// Block read from state.picture_info_offset by ComposeFrameHeader. The layout is fixed
    /// (0x2a0 bytes, checked below).
    struct H264DecoderContext {
        INSERT_PADDING_BYTES(0x48);
        // Size in bytes of the raw frame bitstream
        u32 frame_data_size{};
        INSERT_PADDING_BYTES(0xc);
        H264ParameterSet h264_parameter_set{};
        INSERT_PADDING_BYTES(0x100);
        // Six 4x4 scaling lists (6 * 16 bytes)
        std::array<u8, 0x60> scaling_matrix_4;
        // Two 8x8 scaling lists (2 * 64 bytes)
        std::array<u8, 0x80> scaling_matrix_8;
    };
    static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
    // Output buffer returned by reference from ComposeFrameHeader
    std::vector<u8> frame;
    GPU& gpu;
 };
 } // namespace Decoder
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/vp9.cpp
+++ b/src/video_core/command_classes/codecs/vp9.cpp
--- a/src/video_core/command_classes/codecs/vp9.h
+++ b/src/video_core/command_classes/codecs/vp9.h
@ -0,0 +1,216 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <unordered_map>
 #include <vector>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/stream.h"
 #include "video_core/command_classes/codecs/vp9_types.h"
 #include "video_core/command_classes/nvdec_common.h"
 namespace Tegra {
 class GPU;
 /// Frame kinds distinguished for VP9: key frames and inter frames.
 enum class FrameType { KeyFrame = 0, InterFrame = 1 };
 namespace Decoder {
 /// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
 /// VP9 header bitstreams.
 /// Probability-based bit encoder whose output is collected in a Common::Stream.
 class VpxRangeEncoder {
 public:
    VpxRangeEncoder();
    ~VpxRangeEncoder();
    /// Writes the rightmost value_size bits from value into the stream
    void Write(s32 value, s32 value_size);
    /// Writes a single bit with half probability
    void Write(bool bit);
    /// Writes a bit to the base_stream encoded with probability
    void Write(bool bit, s32 probability);
    /// Signal the end of the bitstream
    void End();
    /// Returns the buffer of the underlying stream
    std::vector<u8>& GetBuffer() {
        return base_stream.GetBuffer();
    }
    /// Returns the buffer of the underlying stream (read-only)
    const std::vector<u8>& GetBuffer() const {
        return base_stream.GetBuffer();
    }
 private:
    u8 PeekByte();
    // Stream receiving the encoded bytes
    Common::Stream base_stream{};
    // Encoder state; the initial values below are the encoder's starting state
    u32 low_value{};
    u32 range{0xff};
    s32 count{-24};
    // Probability used by Write(bool)
    s32 half_probability{128};
    // NOTE(review): presumably the normalization shift per range value; confirm in vp9.cpp
    static constexpr std::array<s32, 256> norm_lut{
        0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
 };
 /// Plain (non-probability-coded) bit writer. Bits are collected in a one-byte staging buffer
 /// and appended to byte_array when flushed.
 class VpxBitStreamWriter {
 public:
    VpxBitStreamWriter();
    ~VpxBitStreamWriter();
    /// Write an unsigned integer value
    void WriteU(u32 value, u32 value_size);
    /// Write a signed integer value
    void WriteS(s32 value, u32 value_size);
    /// Based on 6.2.10 of VP9 Spec, writes a delta coded value
    void WriteDeltaQ(u32 value);
    /// Write a single bit.
    void WriteBit(bool state);
    /// Pushes current buffer into byte_array, resets buffer
    void Flush();
    /// Returns byte_array
    std::vector<u8>& GetByteArray();
    /// Returns const byte_array
    const std::vector<u8>& GetByteArray() const;
 private:
    /// Write bit_count bits from value into buffer
    void WriteBits(u32 value, u32 bit_count);
    /// Gets next available position in buffer, invokes Flush() if buffer is full
    s32 GetFreeBufferBits();
    // Capacity of the staging buffer in bits
    s32 buffer_size{8};
    // Staging buffer holding the bits not yet flushed
    s32 buffer{};
    // Number of bits currently used in the staging buffer
    s32 buffer_pos{};
    // Bytes flushed so far
    std::vector<u8> byte_array;
 };
 /// Composes VP9 frames (headers plus bitstream) for FFmpeg from the picture information and
 /// entropy data provided through the NVDEC register state.
 class VP9 {
 public:
    explicit VP9(GPU& gpu);
    ~VP9();
    /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
    /// documentation
    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);
    /// Returns true if the most recent frame was a hidden frame.
    bool WasFrameHidden() const {
        return hidden;
    }
 private:
    /// Generates compressed header probability updates in the bitstream writer
    template <typename T, std::size_t N>
    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
                                const std::array<T, N>& old_prob);
    /// Generates compressed header probability updates in the bitstream writer
    /// If probs are not equal, WriteProbabilityDelta is invoked
    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
    /// Generates compressed header probability deltas in the bitstream writer
    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
    /// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
    s32 RemapProbability(s32 new_prob, s32 old_prob);
    /// Recenters probability. Based on section 6.3.6 of VP9 Specification
    s32 RecenterNonNeg(s32 new_prob, s32 old_prob);
    /// Inverse of 6.3.4 Decode term subexp
    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
    /// Writes if the value is less than the test value
    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
    /// Writes probability updates for the Coef probabilities
    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
                                    const std::array<u8, 2304>& new_prob,
                                    const std::array<u8, 2304>& old_prob);
    /// Write probabilities for 4-byte aligned structures
    template <typename T, std::size_t N>
    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
                                        const std::array<T, N>& old_prob);
    /// Write motion vector probability updates. 6.3.17 in the spec
    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
    /// 6.2.14 Tile size calculation
    s32 CalcMinLog2TileCols(s32 frame_width);
    s32 CalcMaxLog2TileCols(s32 frame_width);
    /// Returns VP9 information from NVDEC provided offset and size
    Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
    /// Returns frame to be decoded after buffering
    Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
    /// Use NVDEC provided information to compose the headers for the current frame
    std::vector<u8> ComposeCompressedHeader();
    VpxBitStreamWriter ComposeUncompressedHeader();
    GPU& gpu;
    // Output buffer returned by reference from ComposeFrameHeader
    std::vector<u8> frame;
    std::array<s8, 4> loop_filter_ref_deltas{};
    std::array<s8, 2> loop_filter_mode_deltas{};
    // Value-initialized so WasFrameHidden() never reads an indeterminate value when it is
    // queried before the first frame has been composed.
    bool hidden{};
    s64 current_frame_number = -2; // since we buffer 2 frames
    s32 grace_period = 6;          // frame offsets need to stabilize
    std::array<FrameContexts, 4> frame_ctxs{};
    // The two frames buffered ahead of the one being composed
    Vp9FrameContainer next_frame{};
    Vp9FrameContainer next_next_frame{};
    bool swap_next_golden{};
    Vp9PictureInfo current_frame_info{};
    Vp9EntropyProbs prev_frame_probs{};
    s32 diff_update_probability = 252;
    s32 frame_sync_code = 0x498342;
    // NOTE(review): presumably the lookup used when remapping probability deltas
    // (see RemapProbability); confirm in vp9.cpp
    static constexpr std::array<s32, 254> map_lut = {
        20,  21,  22,  23,  24,  25,  0,   26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
        36,  37,  1,   38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  2,   50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  3,   62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  4,   74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  5,   86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  6,
        98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 7,   110, 111, 112, 113,
        114, 115, 116, 117, 118, 119, 120, 121, 8,   122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 9,   134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
        10,  146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,  158, 159, 160,
        161, 162, 163, 164, 165, 166, 167, 168, 169, 12,  170, 171, 172, 173, 174, 175, 176,
        177, 178, 179, 180, 181, 13,  182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
        193, 14,  194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15,  206, 207,
        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 16,  218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 17,  230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
        240, 241, 18,  242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
    };
 };
 } // namespace Decoder
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/vp9_types.h
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@ -0,0 +1,369 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <algorithm>
 #include <list>
 #include <vector>
 #include "common/cityhash.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/command_classes/nvdec_common.h"
 namespace Tegra {
 class GPU;
 namespace Decoder {
 /// Dimensions and pitches of one frame, four 16-bit fields (8 bytes, checked below).
 /// Embedded in PictureInfo for the last, golden, alt and current frames.
 struct Vp9FrameDimensions {
    s16 width{};
    s16 height{};
    s16 luma_pitch{};
    s16 chroma_pitch{};
 };
 static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");
 /// Bit masks for PictureInfo::vp9_flags; PictureInfo::Convert() tests them individually.
 enum FrameFlags : u32 {
    IsKeyFrame = 1 << 0,
    LastFrameIsKeyFrame = 1 << 1,
    FrameSizeChanged = 1 << 2,
    ErrorResilientMode = 1 << 3,
    LastShowFrame = 1 << 4,
    IntraOnly = 1 << 5,
 };
 /// Motion vector joint types: which of the horizontal / vertical components are nonzero.
 enum class MvJointType {
    MvJointZero = 0,   /* Zero vector */
    MvJointHnzvz = 1,  /* Vert zero, hor nonzero */
    MvJointHzvnz = 2,  /* Hor zero, vert nonzero */
    MvJointHnzvnz = 3, /* Both components nonzero */
 };
 /// Motion vector magnitude classes; each class covers the integer-pel range noted beside it.
 enum class MvClassType {
    MvClass0 = 0,   /* (0, 2]     integer pel */
    MvClass1 = 1,   /* (2, 4]     integer pel */
    MvClass2 = 2,   /* (4, 8]     integer pel */
    MvClass3 = 3,   /* (8, 16]    integer pel */
    MvClass4 = 4,   /* (16, 32]   integer pel */
    MvClass5 = 5,   /* (32, 64]   integer pel */
    MvClass6 = 6,   /* (64, 128]  integer pel */
    MvClass7 = 7,   /* (128, 256] integer pel */
    MvClass8 = 8,   /* (256, 512] integer pel */
    MvClass9 = 9,   /* (512, 1024] integer pel */
    MvClass10 = 10, /* (1024,2048] integer pel */
 };
 /// Block sizes from 4x4 up to 64x64. BlockSizes is the number of real entries and
 /// BlockInvalid aliases it as an out-of-range marker.
 enum class BlockSize {
    Block4x4 = 0,
    Block4x8 = 1,
    Block8x4 = 2,
    Block8x8 = 3,
    Block8x16 = 4,
    Block16x8 = 5,
    Block16x16 = 6,
    Block16x32 = 7,
    Block32x16 = 8,
    Block32x32 = 9,
    Block32x64 = 10,
    Block64x32 = 11,
    Block64x64 = 12,
    BlockSizes = 13,
    BlockInvalid = BlockSizes
 };
 /// Prediction modes: values 0-9 are the intra modes described beside them, values 10-13 are
 /// the motion-vector modes. MbModeCount is the number of entries.
 enum class PredictionMode {
    DcPred = 0,   // Average of above and left pixels
    VPred = 1,    // Vertical
    HPred = 2,    // Horizontal
    D45Pred = 3,  // Directional 45  deg = round(arctan(1 / 1) * 180 / pi)
    D135Pred = 4, // Directional 135 deg = 180 - 45
    D117Pred = 5, // Directional 117 deg = 180 - 63
    D153Pred = 6, // Directional 153 deg = 180 - 27
    D207Pred = 7, // Directional 207 deg = 180 + 27
    D63Pred = 8,  // Directional 63  deg = round(arctan(2 / 1) * 180 / pi)
    TmPred = 9,   // True-motion
    NearestMv = 10,
    NearMv = 11,
    ZeroMv = 12,
    NewMv = 13,
    MbModeCount = 14
 };
 /// Transform sizes. TxSizes is the number of entries.
 enum class TxSize {
    Tx4x4 = 0,   // 4x4 transform
    Tx8x8 = 1,   // 8x8 transform
    Tx16x16 = 2, // 16x16 transform
    Tx32x32 = 3, // 32x32 transform
    TxSizes = 4
 };
 /// Transform modes: the largest transform size allowed, or per-block selection.
 /// TxModes is the number of entries.
 enum class TxMode {
    Only4X4 = 0,      // Only 4x4 transform used
    Allow8X8 = 1,     // Allow block transform size up to 8x8
    Allow16X16 = 2,   // Allow block transform size up to 16x16
    Allow32X32 = 3,   // Allow block transform size up to 32x32
    TxModeSelect = 4, // Transform specified for each block
    TxModes = 5
 };
 /// Reference modes. ReferenceModes is the number of entries.
 /// NOTE(review): the type name is snake_case unlike the other enums in this header, and the
 /// reference_mode fields of PictureInfo / Vp9PictureInfo are plain integers, not this enum.
 enum class reference_mode {
    SingleReference = 0,
    CompoundReference = 1,
    ReferenceModeSelect = 2,
    ReferenceModes = 3
 };
 /// Segmentation block embedded in PictureInfo (0x64 bytes, checked below).
 /// PictureInfo::Convert() copies its members into the segment_* fields of Vp9PictureInfo.
 struct Segmentation {
    u8 enabled{};
    u8 update_map{};
    u8 temporal_update{};
    u8 abs_delta{};
    // One mask and four data values for each of the 8 entries
    std::array<u32, 8> feature_mask{};
    std::array<std::array<s16, 4>, 8> feature_data{};
 };
 static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
 /// Loop filter block embedded in PictureInfo (7 bytes, checked below).
 /// PictureInfo::Convert() copies its members into Vp9PictureInfo.
 struct LoopFilter {
    u8 mode_ref_delta_enabled{};
    std::array<s8, 4> ref_deltas{};
    std::array<s8, 2> mode_deltas{};
 };
 static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
 /// Flattened probability tables (byte arrays only, 0x9F4 bytes in total, checked below).
 /// This is the destination type filled by EntropyProbs::Convert().
 struct Vp9EntropyProbs {
    std::array<u8, 36> y_mode_prob{};
    std::array<u8, 64> partition_prob{};
    std::array<u8, 2304> coef_probs{};
    std::array<u8, 8> switchable_interp_prob{};
    std::array<u8, 28> inter_mode_prob{};
    std::array<u8, 4> intra_inter_prob{};
    std::array<u8, 5> comp_inter_prob{};
    std::array<u8, 10> single_ref_prob{};
    std::array<u8, 5> comp_ref_prob{};
    std::array<u8, 6> tx_32x32_prob{};
    std::array<u8, 4> tx_16x16_prob{};
    std::array<u8, 2> tx_8x8_prob{};
    std::array<u8, 3> skip_probs{};
    std::array<u8, 3> joints{};
    std::array<u8, 2> sign{};
    std::array<u8, 20> classes{};
    std::array<u8, 2> class_0{};
    std::array<u8, 20> prob_bits{};
    std::array<u8, 12> class_0_fr{};
    std::array<u8, 6> fr{};
    std::array<u8, 2> class_0_hp{};
    std::array<u8, 2> high_precision{};
 };
 static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size");
 /// Unpacked picture information produced by PictureInfo::Convert(): the packed flags become
 /// booleans and the narrow integer fields are widened.
 /// Convert() uses designated initializers, so the member order here must stay in sync with it.
 struct Vp9PictureInfo {
    bool is_key_frame{};
    bool intra_only{};
    bool last_frame_was_key{};
    bool frame_size_changed{};
    bool error_resilient_mode{};
    bool last_frame_shown{};
    // Not assigned by PictureInfo::Convert()
    bool show_frame{};
    std::array<s8, 4> ref_frame_sign_bias{};
    s32 base_q_index{};
    s32 y_dc_delta_q{};
    s32 uv_dc_delta_q{};
    s32 uv_ac_delta_q{};
    bool lossless{};
    s32 transform_mode{};
    bool allow_high_precision_mv{};
    s32 interp_filter{};
    s32 reference_mode{};
    s8 comp_fixed_ref{};
    std::array<s8, 2> comp_var_ref{};
    s32 log2_tile_cols{};
    s32 log2_tile_rows{};
    bool segment_enabled{};
    bool segment_map_update{};
    bool segment_map_temporal_update{};
    s32 segment_abs_delta{};
    std::array<u32, 8> segment_feature_enable{};
    std::array<std::array<s16, 4>, 8> segment_feature_data{};
    bool mode_ref_delta_enabled{};
    bool use_prev_in_find_mv_refs{};
    std::array<s8, 4> ref_deltas{};
    std::array<s8, 2> mode_deltas{};
    // Not assigned by PictureInfo::Convert()
    Vp9EntropyProbs entropy{};
    Vp9FrameDimensions frame_size{};
    u8 first_level{};
    u8 sharpness_level{};
    u32 bitstream_size{};
    // The two members below are not assigned by PictureInfo::Convert()
    std::array<u64, 4> frame_offsets{};
    std::array<bool, 4> refresh_frame{};
 };
 /// Pairs the picture information of a frame with its bitstream bytes.
 struct Vp9FrameContainer {
    Vp9PictureInfo info{};
    std::vector<u8> bit_stream;
 };
 /// Packed VP9 picture information with a fixed layout (0x100 bytes, checked below); members
 /// must not be reordered or resized.
 /// NOTE(review): presumably read directly from NVDEC-provided memory; confirm in vp9.cpp.
 struct PictureInfo {
    INSERT_PADDING_WORDS(12);
    u32 bitstream_size{};
    INSERT_PADDING_WORDS(5);
    Vp9FrameDimensions last_frame_size{};
    Vp9FrameDimensions golden_frame_size{};
    Vp9FrameDimensions alt_frame_size{};
    Vp9FrameDimensions current_frame_size{};
    // Bit set of FrameFlags values
    u32 vp9_flags{};
    std::array<s8, 4> ref_frame_sign_bias{};
    u8 first_level{};
    u8 sharpness_level{};
    u8 base_q_index{};
    u8 y_dc_delta_q{};
    u8 uv_ac_delta_q{};
    u8 uv_dc_delta_q{};
    u8 lossless{};
    u8 tx_mode{};
    u8 allow_high_precision_mv{};
    u8 interp_filter{};
    u8 reference_mode{};
    s8 comp_fixed_ref{};
    std::array<s8, 2> comp_var_ref{};
    u8 log2_tile_cols{};
    u8 log2_tile_rows{};
    Segmentation segmentation{};
    LoopFilter loop_filter{};
    INSERT_PADDING_BYTES(5);
    u32 surface_params{};
    INSERT_PADDING_WORDS(3);
    /// Unpacks this structure into a Vp9PictureInfo: flag bits become booleans and narrow
    /// integers are widened. Members of Vp9PictureInfo that are not listed keep their defaults.
    Vp9PictureInfo Convert() const {
        return Vp9PictureInfo{
            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
            .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
            .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
            .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
            .ref_frame_sign_bias = ref_frame_sign_bias,
            .base_q_index = base_q_index,
            .y_dc_delta_q = y_dc_delta_q,
            .uv_dc_delta_q = uv_dc_delta_q,
            .uv_ac_delta_q = uv_ac_delta_q,
            .lossless = lossless != 0,
            .transform_mode = tx_mode,
            .allow_high_precision_mv = allow_high_precision_mv != 0,
            .interp_filter = interp_filter,
            .reference_mode = reference_mode,
            .comp_fixed_ref = comp_fixed_ref,
            .comp_var_ref = comp_var_ref,
            .log2_tile_cols = log2_tile_cols,
            .log2_tile_rows = log2_tile_rows,
            .segment_enabled = segmentation.enabled != 0,
            .segment_map_update = segmentation.update_map != 0,
            .segment_map_temporal_update = segmentation.temporal_update != 0,
            .segment_abs_delta = segmentation.abs_delta,
            .segment_feature_enable = segmentation.feature_mask,
            .segment_feature_data = segmentation.feature_data,
            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
            // Each flag is tested with a mask, like the fields above. Comparing the whole word
            // with == made this true only when LastShowFrame was the sole bit set.
            .use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
                                        (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
                                        (vp9_flags & FrameFlags::IntraOnly) == 0 &&
                                        (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
                                        (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
            .ref_deltas = loop_filter.ref_deltas,
            .mode_deltas = loop_filter.mode_deltas,
            .frame_size = current_frame_size,
            .first_level = first_level,
            .sharpness_level = sharpness_level,
            .bitstream_size = bitstream_size,
        };
    }
 };
 static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
 /// Fixed-layout block of VP9 entropy probability tables. The static_assert that follows this
 /// struct pins its size to 0xEA0 bytes, so the member order, the array extents and the padding
 /// runs must stay exactly as they are.
 struct EntropyProbs {
    INSERT_PADDING_BYTES(1024);
    std::array<std::array<u8, 4>, 7> inter_mode_prob{};
    std::array<u8, 4> intra_inter_prob{};
    INSERT_PADDING_BYTES(80);
    std::array<std::array<u8, 1>, 2> tx_8x8_prob{};
    std::array<std::array<u8, 2>, 2> tx_16x16_prob{};
    std::array<std::array<u8, 3>, 2> tx_32x32_prob{};
    // The y-mode table is stored split: the ninth entry of each of the 4 rows lives in
    // y_mode_prob_e8, the first eight in y_mode_prob_e0e7. Convert() stitches them together.
    std::array<u8, 4> y_mode_prob_e8{};
    std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
    INSERT_PADDING_BYTES(64);
    std::array<std::array<u8, 4>, 16> partition_prob{};
    INSERT_PADDING_BYTES(10);
    std::array<std::array<u8, 2>, 4> switchable_interp_prob{};
    std::array<u8, 5> comp_inter_prob{};
    std::array<u8, 4> skip_probs{};
    std::array<u8, 3> joints{};
    std::array<u8, 2> sign{};
    std::array<std::array<u8, 1>, 2> class_0{};
    std::array<std::array<u8, 3>, 2> fr{};
    std::array<u8, 2> class_0_hp{};
    std::array<u8, 2> high_precision{};
    std::array<std::array<u8, 10>, 2> classes{};
    std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{};
    std::array<std::array<u8, 10>, 2> pred_bits{};
    std::array<std::array<u8, 2>, 5> single_ref_prob{};
    std::array<u8, 5> comp_ref_prob{};
    INSERT_PADDING_BYTES(17);
    std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4>
        coef_probs{};
    /// Copies every table of this struct into the matching member of `fc`.
    ///
    /// Each memcpy copies `fc.<member>.size()` bytes, i.e. the element count of the destination.
    /// NOTE(review): that equals the intended byte count only if the Vp9EntropyProbs members are
    /// flat arrays of one-byte elements — confirm against the Vp9EntropyProbs declaration.
    void Convert(Vp9EntropyProbs& fc) {
        std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size());
        std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(),
                    fc.intra_inter_prob.size());
        std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size());
        std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size());
        std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size());
        // Rebuild the flat 4x9 y-mode table: eight entries from y_mode_prob_e0e7[i] followed by
        // the single entry y_mode_prob_e8[i], for each of the four rows.
        for (s32 i = 0; i < 4; i++) {
            for (s32 j = 0; j < 9; j++) {
                fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
            }
        }
        std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size());
        std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(),
                    fc.switchable_interp_prob.size());
        std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size());
        std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size());
        std::memcpy(fc.joints.data(), joints.data(), fc.joints.size());
        std::memcpy(fc.sign.data(), sign.data(), fc.sign.size());
        std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size());
        std::memcpy(fc.fr.data(), fr.data(), fc.fr.size());
        std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size());
        std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size());
        std::memcpy(fc.classes.data(), classes.data(), fc.classes.size());
        std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size());
        // Source member is named pred_bits here, destination member is named prob_bits.
        std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size());
        std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size());
        std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size());
        std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size());
    }
 };
 static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
 /// Names the three reference slots a frame can be associated with.
 /// NOTE(review): presumably the VP9 last / golden / alt-ref reference frames — confirm.
 enum class Ref { Last, Golden, AltRef };
 /// One entry of a reference pool: a frame number, the reference slot it belongs to and a
 /// refresh flag. All members are value-initialized.
 struct RefPoolElement {
    s64 frame{};
    Ref ref{};
    bool refresh{};
 };
 /// A set of entropy probabilities together with bookkeeping about it.
 /// NOTE(review): `from` looks like the frame number the probabilities originate from and
 /// `adapted` like a "backward adaptation already applied" flag — confirm against the users.
 struct FrameContexts {
    s64 from{};
    bool adapted{};
    Vp9EntropyProbs probs{};
 };
 }; // namespace Decoder
 }; // namespace Tegra
--- a/src/video_core/command_classes/host1x.cpp
+++ b/src/video_core/command_classes/host1x.cpp
@ -0,0 +1,39 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "common/assert.h"
 #include "video_core/command_classes/host1x.h"
 #include "video_core/gpu.h"
 // Binds the Host1x class to a GPU instance; only the reference is stored.
 Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
 // Nothing needs explicit cleanup; the destructor is defined out of line.
 Tegra::Host1x::~Host1x() = default;
 /// Stores one 32-bit argument into the register file.
 ///
 /// @param offset    Register index in 32-bit words (not bytes) from the start of `state`.
 /// @param arguments Value to store at that register.
 ///
 /// Writes whose offset lies outside Host1xClassRegisters are logged and dropped.
 void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) {
    // The offset is derived from a caller-supplied method id, so it must be validated before it
    // is turned into a pointer; otherwise memory following `state` would be overwritten.
    constexpr std::size_t num_registers = sizeof(Host1xClassRegisters) / sizeof(u32);
    if (offset >= num_registers) {
        LOG_ERROR(HW_GPU, "Host1x state write out of bounds, offset=0x{:X}", offset);
        return;
    }
    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32);
    std::memcpy(state_offset, &arguments, sizeof(u32));
 }
 /// Records a method write in the register file and performs the action tied to it.
 ///
 /// @param method    Register index (in 32-bit words) being written.
 /// @param arguments Method arguments; only the first element is used.
 ///
 /// A call with an empty argument list is logged and ignored.
 void Tegra::Host1x::ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments) {
    // Indexing an empty vector is undefined behaviour, so bail out before touching arguments[0].
    if (arguments.empty()) {
        LOG_ERROR(HW_GPU, "Host1x method 0x{:X} called without arguments",
                  static_cast<u32>(method));
        return;
    }
    const u32 argument = arguments[0];
    StateWrite(static_cast<u32>(method), argument);
    switch (method) {
    // Both wait variants are handled the same way.
    case Method::WaitSyncpt:
    case Method::WaitSyncpt32:
        Execute(argument);
        break;
    case Method::LoadSyncptPayload32:
        syncpoint_value = argument;
        break;
    default:
        UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
        break;
    }
 }
 // Placeholder for the syncpoint wait triggered by WaitSyncpt / WaitSyncpt32.
 // The body is currently empty, so `data` is ignored and the call returns immediately.
 void Tegra::Host1x::Execute(u32 data) {
    // This method waits on a valid syncpoint.
    // TODO: Implement when proper Async is in place
 }
--- a/src/video_core/command_classes/host1x.h
+++ b/src/video_core/command_classes/host1x.h
@ -0,0 +1,78 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <vector>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 namespace Tegra {
 class GPU;
 class Nvdec;
 /// Host1x command class: records method writes in a register file and reacts to the
 /// syncpoint-related methods listed in Method.
 /// NOTE(review): this header uses std::array and offsetof, but the includes above only pull in
 /// <vector> and the common headers — confirm <array>/<cstddef> arrive transitively.
 class Host1x {
 public:
    /// Register file, one u32 per register. The static_assert below pins the size to 0x164
    /// bytes, so members and padding must not be reordered or resized.
    struct Host1xClassRegisters {
        u32 incr_syncpt{};
        u32 incr_syncpt_ctrl{};
        u32 incr_syncpt_error{};
        INSERT_PADDING_WORDS(5);
        u32 wait_syncpt{};
        u32 wait_syncpt_base{};
        u32 wait_syncpt_incr{};
        u32 load_syncpt_base{};
        u32 incr_syncpt_base{};
        u32 clear{};
        u32 wait{};
        u32 wait_with_interrupt{};
        u32 delay_use{};
        u32 tick_count_high{};
        u32 tick_count_low{};
        u32 tick_ctrl{};
        INSERT_PADDING_WORDS(23);
        u32 ind_ctrl{};
        u32 ind_off2{};
        u32 ind_off{};
        std::array<u32, 31> ind_data{};
        INSERT_PADDING_WORDS(1);
        u32 load_syncpoint_payload32{};
        u32 stall_ctrl{};
        u32 wait_syncpt32{};
        u32 wait_syncpt_base32{};
        u32 load_syncpt_base32{};
        u32 incr_syncpt_base32{};
        u32 stall_count_high{};
        u32 stall_count_low{};
        u32 xref_ctrl{};
        u32 channel_xref_high{};
        u32 channel_xref_low{};
    };
    static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
    /// Methods with dedicated handling. Each value is the register's index in 32-bit words,
    /// computed from its byte offset inside Host1xClassRegisters.
    enum class Method : u32 {
        WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
        LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
        WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
    };
    explicit Host1x(GPU& gpu);
    ~Host1x();
    /// Writes the method into the state, Invoke Execute() if encountered
    void ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments);
 private:
    /// For Host1x, execute is waiting on a syncpoint previously written into the state
    void Execute(u32 data);
    /// Write argument into the provided offset
    void StateWrite(u32 offset, u32 arguments);
    // Last value received through Method::LoadSyncptPayload32.
    u32 syncpoint_value{};
    Host1xClassRegisters state{};
    GPU& gpu;
 };
 } // namespace Tegra
--- a/src/video_core/command_classes/nvdec.cpp
+++ b/src/video_core/command_classes/nvdec.cpp
@ -0,0 +1,56 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <bitset>
 #include "common/assert.h"
 #include "common/bit_util.h"
 #include "core/memory.h"
 #include "video_core/command_classes/nvdec.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 namespace Tegra {
 // Stores the GPU reference and creates the Codec instance that all methods are forwarded to.
 // `gpu` is declared before `codec` in the class, so it is initialized before being passed on.
 Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
 // Defined out of line; the owned Codec is released by its unique_ptr.
 Nvdec::~Nvdec() = default;
 /// Records a method write in the codec state and performs the action tied to it.
 ///
 /// @param method    Method id being written.
 /// @param arguments Method arguments; only the first element is used.
 ///
 /// SetVideoCodec stores its argument unchanged; every other method stores the argument shifted
 /// left by 8 bits. A call with an empty argument list is logged and ignored.
 void Nvdec::ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments) {
    // Indexing an empty vector is undefined behaviour, so bail out before touching arguments[0].
    if (arguments.empty()) {
        LOG_ERROR(HW_GPU, "Nvdec method 0x{:X} called without arguments",
                  static_cast<u32>(method));
        return;
    }
    const u32 argument = arguments[0];
    if (method == Method::SetVideoCodec) {
        codec->StateWrite(static_cast<u32>(method), argument);
    } else {
        // Widen before shifting so the upper bits of the argument are not lost.
        codec->StateWrite(static_cast<u32>(method), static_cast<u64>(argument) << 8);
    }
    switch (method) {
    case Method::SetVideoCodec:
        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument));
        break;
    case Method::Execute:
        Execute();
        break;
    default:
        // Any other method only updates the state above.
        break;
    }
 }
 // Returns whatever the codec reports as its current frame (mutable access).
 AVFrame* Nvdec::GetFrame() {
    return codec->GetCurrentFrame();
 }
 // Const overload of the accessor above.
 const AVFrame* Nvdec::GetFrame() const {
    return codec->GetCurrentFrame();
 }
 // Decodes a frame when the selected codec is H264 or Vp9; any other codec is reported as
 // unimplemented and nothing is decoded.
 void Nvdec::Execute() {
    const auto current_codec = codec->GetCurrentCodec();
    const bool can_decode = current_codec == NvdecCommon::VideoCodec::H264 ||
                            current_codec == NvdecCommon::VideoCodec::Vp9;
    if (!can_decode) {
        UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(current_codec));
        return;
    }
    codec->Decode();
 }
 } // namespace Tegra
--- a/src/video_core/command_classes/nvdec.h
+++ b/src/video_core/command_classes/nvdec.h
@ -0,0 +1,39 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <vector>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/command_classes/codecs/codec.h"
 namespace Tegra {
 class GPU;
 /// NVDEC command class: forwards method writes to a Codec instance and exposes the frame the
 /// codec currently holds.
 /// NOTE(review): std::unique_ptr and AVFrame are used here without a direct include in this
 /// header — presumably both arrive through codecs/codec.h; confirm.
 class Nvdec {
 public:
    /// Method ids with dedicated handling in ProcessMethod.
    enum class Method : u32 {
        SetVideoCodec = 0x80,
        Execute = 0xc0,
    };
    explicit Nvdec(GPU& gpu);
    ~Nvdec();
    /// Writes the method into the state, Invoke Execute() if encountered
    void ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments);
    /// Return most recently decoded frame
    AVFrame* GetFrame();
    const AVFrame* GetFrame() const;
 private:
    /// Invoke codec to decode a frame
    void Execute();
    GPU& gpu;
    std::unique_ptr<Tegra::Codec> codec;
 };
 } // namespace Tegra
--- a/src/video_core/command_classes/nvdec_common.h
+++ b/src/video_core/command_classes/nvdec_common.h
@ -0,0 +1,48 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 namespace Tegra::NvdecCommon {
 /// NVDEC register file. Named registers are 64 bits wide, padding is counted in 32-bit words,
 /// and the static_assert below pins the total size to 0xBC0 bytes — do not reorder or resize
 /// members.
 /// NOTE(review): std::array is used without a direct #include <array> in this header.
 struct NvdecRegisters {
    INSERT_PADDING_WORDS(256);
    u64 set_codec_id{};
    INSERT_PADDING_WORDS(254);
    u64 set_platform_id{};
    u64 picture_info_offset{};
    u64 frame_bitstream_offset{};
    u64 frame_number{};
    u64 h264_slice_data_offsets{};
    u64 h264_mv_dump_offset{};
    INSERT_PADDING_WORDS(6);
    u64 frame_stats_offset{};
    u64 h264_last_surface_luma_offset{};
    u64 h264_last_surface_chroma_offset{};
    std::array<u64, 17> surface_luma_offset{};
    std::array<u64, 17> surface_chroma_offset{};
    INSERT_PADDING_WORDS(132);
    u64 vp9_entropy_probs_offset{};
    u64 vp9_backward_updates_offset{};
    u64 vp9_last_frame_segmap_offset{};
    u64 vp9_curr_frame_segmap_offset{};
    INSERT_PADDING_WORDS(2);
    u64 vp9_last_frame_mvs_offset{};
    u64 vp9_curr_frame_mvs_offset{};
    INSERT_PADDING_WORDS(2);
 };
 static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
 /// Codec identifiers. Nvdec::ProcessMethod casts the raw SetVideoCodec argument to this type,
 /// so the numeric values must match what the guest writes.
 enum class VideoCodec : u32 {
    None = 0x0,
    H264 = 0x3,
    Vp8 = 0x5,
    H265 = 0x7,
    Vp9 = 0x9,
 };
 } // namespace Tegra::NvdecCommon
--- a/src/video_core/command_classes/sync_manager.cpp
+++ b/src/video_core/command_classes/sync_manager.cpp
@ -0,0 +1,60 @@
 // MIT License
 //
 // Copyright (c) Ryujinx Team and Contributors
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
 // associated documentation files (the "Software"), to deal in the Software without restriction,
 // including without limitation the rights to use, copy, modify, merge, publish, distribute,
 // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or
 // substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 // NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //
 #include <algorithm>
 #include "sync_manager.h"
 #include "video_core/gpu.h"
 namespace Tegra {
 // Stores the GPU reference that IncrementAllDone() uses to bump syncpoints.
 SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
 // Nothing needs explicit cleanup; defined out of line.
 SyncptIncrManager::~SyncptIncrManager() = default;
 /// Queues an increment of syncpoint `id` that is already complete, then flushes every
 /// completed increment at the head of the queue.
 void SyncptIncrManager::Increment(u32 id) {
    // The SyncptIncr constructor takes (id, syncpt_id, class_id, done), which differs from the
    // member order, so positional construction is error-prone: the previous three-argument call
    // left `complete` false and the entry was never flushed. Set the fields by name instead.
    SyncptIncr incr{0, 0, 0, true};
    incr.syncpt_id = id;
    increments.push_back(incr);
    IncrementAllDone();
 }
 /// Queues a pending increment of syncpoint `id` on behalf of `class_id`.
 ///
 /// @return Handle to pass to SignalDone() once the work guarding the increment has finished.
 u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
    const u32 handle = current_id++;
    // Assign by name so class_id and syncpt_id cannot end up swapped.
    SyncptIncr incr{handle, 0, 0, false};
    incr.class_id = class_id;
    incr.syncpt_id = id;
    increments.push_back(incr);
    return handle;
 }
 /// Marks the first queued increment whose id equals `handle` as complete, then flushes every
 /// completed increment at the head of the queue. Unknown handles only trigger the flush.
 void SyncptIncrManager::SignalDone(u32 handle) {
    const auto done_incr =
        std::find_if(increments.begin(), increments.end(),
                     [handle](const SyncptIncr& incr) { return incr.id == handle; });
    if (done_incr != increments.end()) {
        // Flip the flag in place; rebuilding the entry would route the ids through the
        // constructor's parameter order again.
        done_incr->complete = true;
    }
    IncrementAllDone();
 }
 // Walks the queue from the front, incrementing the syncpoint of every entry that is already
 // complete, stops at the first pending entry, and removes the processed prefix.
 void SyncptIncrManager::IncrementAllDone() {
    std::size_t flushed = 0;
    while (flushed < increments.size() && increments[flushed].complete) {
        gpu.IncrementSyncPoint(increments[flushed].syncpt_id);
        ++flushed;
    }
    increments.erase(increments.begin(), increments.begin() + flushed);
 }
 } // namespace Tegra
--- a/src/video_core/command_classes/sync_manager.h
+++ b/src/video_core/command_classes/sync_manager.h
@ -0,0 +1,64 @@
 // MIT License
 //
 // Copyright (c) Ryujinx Team and Contributors
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
 // associated documentation files (the "Software"), to deal in the Software without restriction,
 // including without limitation the rights to use, copy, modify, merge, publish, distribute,
 // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or
 // substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 // NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //
 #pragma once
 #include <mutex>
 #include <vector>
 #include "common/common_types.h"
 namespace Tegra {
 class GPU;
 /// One queued syncpoint increment: the handle identifying it, the class that requested it, the
 /// syncpoint to increment and whether it is ready to be applied.
 struct SyncptIncr {
    u32 id;
    u32 class_id;
    u32 syncpt_id;
    bool complete;
    // NOTE(review): the second and third parameters are (syncpt_id_, class_id_), i.e. the
    // reverse of the member declaration order above. Callers passing arguments in member order
    // get the two ids swapped, and a three-argument call leaves `complete` at its default.
    SyncptIncr(u32 id, u32 syncpt_id_, u32 class_id_, bool done = false)
        : id(id), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
 };
 /// Keeps syncpoint increments in submission order and forwards the completed ones at the head
 /// of the queue to the GPU.
 class SyncptIncrManager {
 public:
    explicit SyncptIncrManager(GPU& gpu);
    ~SyncptIncrManager();
    /// Add syncpoint id and increment all
    void Increment(u32 id);
    /// Returns a handle to increment later
    u32 IncrementWhenDone(u32 class_id, u32 id);
    /// IncrementAllDone, including handle
    void SignalDone(u32 handle);
    /// Increment all sequential pending increments that are already done.
    void IncrementAllDone();
 private:
    std::vector<SyncptIncr> increments;
    // NOTE(review): none of the member functions in sync_manager.cpp lock this mutex — confirm
    // whether callers serialize access or locking is still to be added.
    std::mutex increment_lock;
    // Next handle returned by IncrementWhenDone(); starts at 0.
    u32 current_id{};
    GPU& gpu;
 };
 } // namespace Tegra
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@ -0,0 +1,180 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <array>
 #include "common/assert.h"
 #include "video_core/command_classes/nvdec.h"
 #include "video_core/command_classes/vic.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/texture_cache/surface_params.h"
 extern "C" {
 #include <libswscale/swscale.h>
 }
 namespace Tegra {
 // Stores the GPU reference and takes shared ownership of the NVDEC processor whose frames
 // Execute() reads.
 Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
    : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
 // Releases the libswscale context that Execute() creates on demand; without this the context
 // leaks whenever a Vic instance is destroyed. sws_freeContext() is a no-op for a null pointer,
 // so no guard is needed when no frame was ever converted.
 Vic::~Vic() {
    sws_freeContext(scaler_ctx);
 }
 /// Stores one 32-bit argument into the VIC register file.
 ///
 /// @param offset    Register index in 32-bit words (not bytes) from the start of `vic_state`.
 /// @param arguments Value to store at that register.
 ///
 /// Writes whose offset lies outside VicRegisters are logged and dropped.
 void Vic::VicStateWrite(u32 offset, u32 arguments) {
    // The offset is derived from a caller-supplied method id, so it must be validated before it
    // is turned into a pointer; otherwise memory following `vic_state` would be overwritten.
    constexpr std::size_t num_registers = sizeof(VicRegisters) / sizeof(u32);
    if (offset >= num_registers) {
        LOG_ERROR(HW_GPU, "Vic state write out of bounds, offset=0x{:X}", offset);
        return;
    }
    u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
    std::memcpy(state_offset, &arguments, sizeof(u32));
 }
 /// Records a method write in the VIC register file and latches the addresses needed later by
 /// Execute().
 ///
 /// @param method    Register index (in 32-bit words) being written.
 /// @param arguments Method arguments; only the first element is used.
 ///
 /// Address-carrying methods store the argument shifted left by 8 bits. A call with an empty
 /// argument list is logged and ignored.
 void Vic::ProcessMethod(Vic::Method method, const std::vector<u32>& arguments) {
    LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
    // Indexing an empty vector is undefined behaviour, so bail out before touching arguments[0].
    if (arguments.empty()) {
        LOG_ERROR(HW_GPU, "Vic method 0x{:X} called without arguments",
                  static_cast<u32>(method));
        return;
    }
    const u32 argument = arguments[0];
    VicStateWrite(static_cast<u32>(method), argument);
    // Widen before shifting so the upper bits of the argument are not lost.
    const u64 arg = static_cast<u64>(argument) << 8;
    switch (method) {
    case Method::Execute:
        Execute();
        break;
    case Method::SetConfigStructOffset:
        config_struct_address = arg;
        break;
    case Method::SetOutputSurfaceLumaOffset:
        output_surface_luma_address = arg;
        break;
    case Method::SetOutputSurfaceChromaUOffset:
        output_surface_chroma_u_address = arg;
        break;
    case Method::SetOutputSurfaceChromaVOffset:
        output_surface_chroma_v_address = arg;
        break;
    default:
        // Every other method only updates the register file above.
        break;
    }
 }
 void Vic::Execute() {
    if (output_surface_luma_address == 0) {
        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
                  vic_state.output_surface.luma_offset);
        return;
    }
    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
    const VideoPixelFormat pixel_format =
        static_cast<VideoPixelFormat>(config.pixel_format.Value());
    switch (pixel_format) {
    case VideoPixelFormat::BGRA8:
    case VideoPixelFormat::RGBA8: {
        LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
        const auto* frame = nvdec_processor->GetFrame();
        if (!frame || frame->width == 0 || frame->height == 0) {
            return;
        }
        if (scaler_ctx == nullptr || frame->width != scaler_width ||
            frame->height != scaler_height) {
            const AVPixelFormat target_format =
                (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
            sws_freeContext(scaler_ctx);
            scaler_ctx = nullptr;
            // FFmpeg returns all frames in YUV420, convert it into expected format
            scaler_ctx =
                sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
                               frame->height, target_format, 0, nullptr, nullptr, nullptr);
            scaler_width = frame->width;
            scaler_height = frame->height;
        }
        // Get Converted frame
        const std::size_t linear_size = frame->width * frame->height * 4;
        using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
        AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
        const int converted_stride{frame->width * 4};
        u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
        sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
                  &converted_frame_buf_addr, &converted_stride);
        const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
        if (blk_kind != 0) {
            // swizzle pitch linear to block linear
            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
                                                            block_height, 0);
            std::vector<u8> swizzled_data(size);
            Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4,
                                             swizzled_data.data(), converted_frame_buffer.get(),
                                             false, block_height, 0, 1);
            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
            gpu.Maxwell3D().OnMemoryWrite();
        } else {
            // send pitch linear frame
            gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
                                           linear_size);
            gpu.Maxwell3D().OnMemoryWrite();
        }
        break;
    }
    case VideoPixelFormat::Yuv420: {
        LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
        const auto* frame = nvdec_processor->GetFrame();
        if (!frame || frame->width == 0 || frame->height == 0) {
            return;
        }
        const std::size_t surface_width = config.surface_width_minus1 + 1;
        const std::size_t surface_height = config.surface_height_minus1 + 1;
        const std::size_t half_width = surface_width / 2;
        const std::size_t half_height = config.surface_height_minus1 / 2;
        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
        const auto* luma_ptr = frame->data[0];
        const auto* chroma_b_ptr = frame->data[1];
        const auto* chroma_r_ptr = frame->data[2];
        const auto stride = frame->linesize[0];
        const auto half_stride = frame->linesize[1];
        std::vector<u8> luma_buffer(aligned_width * surface_height);
        std::vector<u8> chroma_buffer(aligned_width * half_height);
        // Populate luma buffer
        for (std::size_t y = 0; y < surface_height - 1; ++y) {
            std::size_t src = y * stride;
            std::size_t dst = y * aligned_width;
            std::size_t size = surface_width;
            for (std::size_t offset = 0; offset < size; ++offset) {
                luma_buffer[dst + offset] = luma_ptr[src + offset];
            }
        }
        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
                                       luma_buffer.size());
        // Populate chroma buffer from both channels with interleaving.
        for (std::size_t y = 0; y < half_height; ++y) {
            std::size_t src = y * half_stride;
            std::size_t dst = y * aligned_width;
            for (std::size_t x = 0; x < half_width; ++x) {
                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
            }
        }
        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
                                       chroma_buffer.size());
        gpu.Maxwell3D().OnMemoryWrite();
        break;
    }
    default:
        UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
        break;
    }
 }
 } // namespace Tegra
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@ -0,0 +1,110 @@
 // Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <memory>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
 struct SwsContext;
 namespace Tegra {
 class GPU;
 class Nvdec;
 /// Three consecutive 32-bit registers holding the offsets of a surface's luma, chroma-U and
 /// chroma-V planes.
 struct PlaneOffsets {
    u32 luma_offset{};
    u32 chroma_u_offset{};
    u32 chroma_v_offset{};
 };
 /// VIC register file, one u32 per register. The static_assert below pins the size to 0x7A0
 /// bytes, so members and padding must not be reordered or resized.
 /// NOTE(review): INSERT_PADDING_WORDS and std::array are used here, but this header includes
 /// neither common/common_funcs.h nor <array> directly — confirm they arrive transitively.
 struct VicRegisters {
    INSERT_PADDING_WORDS(64);
    u32 nop{};
    INSERT_PADDING_WORDS(15);
    u32 pm_trigger{};
    INSERT_PADDING_WORDS(47);
    u32 set_application_id{};
    u32 set_watchdog_timer{};
    INSERT_PADDING_WORDS(17);
    u32 context_save_area{};
    u32 context_switch{};
    INSERT_PADDING_WORDS(43);
    u32 execute{};
    INSERT_PADDING_WORDS(63);
    std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
    u32 picture_index{};
    u32 control_params{};
    u32 config_struct_offset{};
    u32 filter_struct_offset{};
    u32 palette_offset{};
    u32 hist_offset{};
    u32 context_id{};
    u32 fce_ucode_size{};
    PlaneOffsets output_surface{};
    u32 fce_ucode_offset{};
    INSERT_PADDING_WORDS(4);
    std::array<u32, 8> slot_context_id{};
    INSERT_PADDING_WORDS(16);
 };
 static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
 /// VIC command class: records method writes in a VicRegisters file and, on Execute, writes the
 /// current NVDEC frame to the configured output surface.
 class Vic {
 public:
    /// Method ids with dedicated handling; each value is a register index in 32-bit words into
    /// VicRegisters.
    enum class Method : u32 {
        Execute = 0xc0,
        SetControlParams = 0x1c1,
        SetConfigStructOffset = 0x1c2,
        SetOutputSurfaceLumaOffset = 0x1c8,
        SetOutputSurfaceChromaUOffset = 0x1c9,
        SetOutputSurfaceChromaVOffset = 0x1ca
    };
    explicit Vic(GPU& gpu, std::shared_ptr<Tegra::Nvdec> nvdec_processor);
    ~Vic();
    /// Write to the device state.
    void ProcessMethod(Vic::Method method, const std::vector<u32>& arguments);
 private:
    /// Converts the current NVDEC frame and writes it to the output surface.
    void Execute();
    /// Stores `arguments` at word index `offset` of vic_state.
    void VicStateWrite(u32 offset, u32 arguments);
    VicRegisters vic_state{};
    /// Values of VicConfig::pixel_format that Execute() handles.
    enum class VideoPixelFormat : u64_le {
        RGBA8 = 0x1f,
        BGRA8 = 0x20,
        Yuv420 = 0x44,
    };
    /// Bit-field view of the 64-bit word Execute() reads at config_struct_address + 0x20.
    union VicConfig {
        u64_le raw{};
        BitField<0, 7, u64_le> pixel_format;
        BitField<7, 2, u64_le> chroma_loc_horiz;
        BitField<9, 2, u64_le> chroma_loc_vert;
        BitField<11, 4, u64_le> block_linear_kind;
        BitField<15, 4, u64_le> block_linear_height_log2;
        BitField<19, 3, u64_le> reserved0;
        BitField<22, 10, u64_le> reserved1;
        BitField<32, 14, u64_le> surface_width_minus1;
        BitField<46, 14, u64_le> surface_height_minus1;
    };
    GPU& gpu;
    std::shared_ptr<Tegra::Nvdec> nvdec_processor;
    // Addresses latched by ProcessMethod(): the raw method argument shifted left by 8 bits.
    GPUVAddr config_struct_address{};
    GPUVAddr output_surface_luma_address{};
    GPUVAddr output_surface_chroma_u_address{};
    GPUVAddr output_surface_chroma_v_address{};
    // libswscale context and the frame size it was created for; (re)created in Execute()
    // whenever the frame dimensions change.
    SwsContext* scaler_ctx{};
    s32 scaler_width{};
    s32 scaler_height{};
 };
 } // namespace Tegra
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@ -27,9 +27,10 @@ namespace Tegra {
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
-GPU::GPU(Core::System& system_, bool is_async_)
+GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
    : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
      dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
      maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
      fermi_2d{std::make_unique<Engines::Fermi2D>()},
      kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() {
    return *dma_pusher;
 }
 // Returns the CDMA pusher by dereferencing cdma_pusher.
 // NOTE(review): the PushCommandBuffer overrides reset cdma_pusher when a video stream ends, so
 // this dereference presumably must not be reached while the pointer is null — confirm callers.
 Tegra::CDmaPusher& GPU::CDmaPusher() {
    return *cdma_pusher;
 }
 // Const access to the DMA pusher.
 const DmaPusher& GPU::DmaPusher() const {
    return *dma_pusher;
 }
 // Const overload of CDmaPusher(); the same null-pointer caveat applies.
 const Tegra::CDmaPusher& GPU::CDmaPusher() const {
    return *cdma_pusher;
 }
 void GPU::WaitFence(u32 syncpoint_id, u32 value) {
    // Synced GPU, is always in sync
    if (!is_async) {
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/cdma_pusher.h"
 #include "video_core/dma_pusher.h"
 using CacheAddr = std::uintptr_t;
@ -157,7 +158,7 @@ public:
              method_count(method_count) {}
    };
-    explicit GPU(Core::System& system, bool is_async);
+    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
    virtual ~GPU();
    /// Binds a renderer to the GPU.
@ -209,6 +210,15 @@ public:
    /// Returns a reference to the GPU DMA pusher.
    Tegra::DmaPusher& DmaPusher();
    /// Returns a const reference to the GPU DMA pusher.
    const Tegra::DmaPusher& DmaPusher() const;
    /// Returns a reference to the GPU CDMA pusher.
    Tegra::CDmaPusher& CDmaPusher();
    /// Returns a const reference to the GPU CDMA pusher.
    const Tegra::CDmaPusher& CDmaPusher() const;
    VideoCore::RendererBase& Renderer() {
        return *renderer;
    }
@ -249,8 +259,9 @@ public:
        return is_async;
    }
-    /// Returns a const reference to the GPU DMA pusher.
+    bool UseNvdec() const {
-    const Tegra::DmaPusher& DmaPusher() const;
+        return use_nvdec;
    }
    struct Regs {
        static constexpr size_t NUM_REGS = 0x40;
@ -311,6 +322,9 @@ public:
    /// Push GPU command entries to be processed
    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
    /// Push GPU command buffer entries to be processed
    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;
    /// Swap buffers (render frame)
    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
@ -349,7 +363,9 @@ protected:
    Core::System& system;
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
    std::unique_ptr<VideoCore::RendererBase> renderer;
    const bool use_nvdec;
 private:
    /// Mapping of command subchannels to their bound engine ids
@ -372,6 +388,7 @@ private:
    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
    std::mutex sync_mutex;
    std::mutex device_mutex;
    std::condition_variable sync_cv;
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@ -10,12 +10,13 @@
 namespace VideoCommon {
-GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {}
+GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec)
    : GPU{system, true, use_nvdec}, gpu_thread{system} {}
 GPUAsynch::~GPUAsynch() = default;
 void GPUAsynch::Start() {
-    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
+    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
    cpu_context = renderer->GetRenderWindow().CreateSharedContext();
    cpu_context->MakeCurrent();
 }
@ -32,6 +33,27 @@ void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
    gpu_thread.SubmitList(std::move(entries));
 }
 /// Feeds a channel command buffer to the CDMA pusher and dispatches it on the calling thread.
 ///
 /// Does nothing when NVDEC emulation is disabled or when `entries` is empty. A first entry
 /// equal to 0xDEADB33F destroys the current pusher; the next regular submission recreates it.
 void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
    if (!use_nvdec) {
        return;
    }
    // The marker check below reads entries[0]; an empty list has no element to inspect.
    if (entries.empty()) {
        return;
    }
    // This condition fires when a video stream ends, clear all intermediary data
    if (entries[0].raw == 0xDEADB33F) {
        cdma_pusher.reset();
        return;
    }
    if (!cdma_pusher) {
        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
    }
    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
    // TODO(ameerj): RE proper async nvdec operation
    // gpu_thread.SubmitCommandBuffer(std::move(entries));
    cdma_pusher->Push(std::move(entries));
    cdma_pusher->DispatchCalls();
 }
 void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    gpu_thread.SwapBuffers(framebuffer);
 }
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@ -20,13 +20,14 @@ namespace VideoCommon {
 /// Implementation of GPU interface that runs the GPU asynchronously
 class GPUAsynch final : public Tegra::GPU {
 public:
-    explicit GPUAsynch(Core::System& system);
+    explicit GPUAsynch(Core::System& system, bool use_nvdec);
    ~GPUAsynch() override;
    void Start() override;
    void ObtainContext() override;
    void ReleaseContext() override;
    void PushGPUEntries(Tegra::CommandList&& entries) override;
    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
    void FlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@ -7,7 +7,7 @@
 namespace VideoCommon {
-GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {}
+GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {}
 GPUSynch::~GPUSynch() = default;
@ -26,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
    dma_pusher->DispatchCalls();
 }
 /// Pushes a channel (CDMA) command list to the CDMA pusher and dispatches it immediately.
 /// Does nothing when NVDEC emulation is disabled or the list is empty.
 /// @param entries Command headers to process; handed to the pusher via std::move, so callers
 ///                should treat the list as consumed once this returns.
 void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
    if (!use_nvdec) {
        return;
    }
    // An empty list carries no commands; bail out so entries[0] below is never out of bounds
    if (entries.empty()) {
        return;
    }
    // This condition fires when a video stream ends, clears all intermediary data
    if (entries[0].raw == 0xDEADB33F) {
        cdma_pusher.reset();
        return;
    }
    // The pusher is dropped at the end of a stream, so re-create it on demand
    if (!cdma_pusher) {
        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
    }
    cdma_pusher->Push(std::move(entries));
    cdma_pusher->DispatchCalls();
 }
 void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    renderer->SwapBuffers(framebuffer);
 }
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@ -19,13 +19,14 @@ namespace VideoCommon {
 /// Implementation of GPU interface that runs the GPU synchronously
 class GPUSynch final : public Tegra::GPU {
 public:
-    explicit GPUSynch(Core::System& system);
+    explicit GPUSynch(Core::System& system, bool use_nvdec);
    ~GPUSynch() override;
    void Start() override;
    void ObtainContext() override;
    void ReleaseContext() override;
    void PushGPUEntries(Tegra::CommandList&& entries) override;
    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
    void FlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@ -18,7 +18,7 @@ namespace VideoCommon::GPUThread {
 /// Runs the GPU thread
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                      Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
-                      SynchState& state) {
+                      SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
    std::string name = "yuzu:GPU";
    MicroProfileOnThreadCreate(name.c_str());
    Common::SetCurrentThreadName(name.c_str());
@ -42,6 +42,10 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
        if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
            dma_pusher.Push(std::move(submit_list->entries));
            dma_pusher.DispatchCalls();
        } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
            // NVDEC
            cdma_pusher.Push(std::move(command_list->entries));
            cdma_pusher.DispatchCalls();
        } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
            renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
        } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
@ -75,15 +79,19 @@ ThreadManager::~ThreadManager() {
 /// Spawns the GPU thread running RunThread. Every argument is forwarded with std::ref, so the
 /// referenced objects must stay alive for as long as the thread runs.
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                 Core::Frontend::GraphicsContext& context,
-                                Tegra::DmaPusher& dma_pusher) {
+                                Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
-    thread = std::thread{RunThread,         std::ref(system),     std::ref(renderer),
+    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
-                         std::ref(context), std::ref(dma_pusher), std::ref(state)};
+                         std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
 }
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
    PushCommand(SubmitListCommand(std::move(entries)));
 }
 /// Wraps the channel command list in a SubmitChCommandEntries command and hands it to
 /// PushCommand; the list is moved into the command rather than copied.
 void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
    PushCommand(SubmitChCommandEntries(std::move(entries)));
 }
 void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
 }
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@ -37,6 +37,14 @@ struct SubmitListCommand final {
    Tegra::CommandList entries;
 };
 /// Command to signal to the GPU thread that a cdma command list is ready for processing
 struct SubmitChCommandEntries final {
    /// Takes ownership of the given command list by moving it into the command.
    /// The parameter shadows the member of the same name; the initializer reads the parameter.
    explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries)
        : entries{std::move(entries)} {}
    /// Channel command headers carried by this command
    Tegra::ChCommandHeaderList entries;
 };
 /// Command to signal to the GPU thread that a swap buffers is pending
 struct SwapBuffersCommand final {
    explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
@ -77,9 +85,9 @@ struct OnCommandListEndCommand final {};
 struct GPUTickCommand final {};
 using CommandData =
-    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+    std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
-                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
+                 SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
-                 GPUTickCommand>;
+                 FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
 struct CommandDataContainer {
    CommandDataContainer() = default;
@ -109,11 +117,14 @@ public:
    /// Creates and starts the GPU thread.
    void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
-                     Tegra::DmaPusher& dma_pusher);
+                     Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
    /// Push GPU command entries to be processed
    void SubmitList(Tegra::CommandList&& entries);
    /// Push GPU CDMA command buffer entries to be processed
    void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
    /// Swap buffers (render frame)
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@ -11,6 +11,7 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 namespace Tegra {
@ -44,6 +45,12 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_
    return Map(cpu_addr, *FindFreeRange(size, align), size);
 }
 /// Finds a free GPU address range of the given size, searching from the low start of the
 /// address space, and maps cpu_addr into it.
 /// @param cpu_addr CPU virtual address to map.
 /// @param size     Size of the mapping in bytes.
 /// @returns The GPU virtual address returned by Map.
 GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
    // The third argument makes FindFreeRange start at address_space_start_low.
    // NOTE(review): the visible search loop is bounded by address_space_size, not by 4 GiB, so
    // the result presumably is not guaranteed to fit in 32 bits - confirm against callers.
    const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true);
    // NOTE(review): gpu_addr is dereferenced below; if ASSERT does not abort in every build
    // configuration, an empty optional would be dereferenced - TODO confirm.
    ASSERT(gpu_addr);
    return Map(cpu_addr, *gpu_addr, size);
 }
 void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    if (!size) {
        return;
@ -108,7 +115,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s
    page_table[PageEntryIndex(gpu_addr)] = page_entry;
 }
-std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const {
+std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
                                                     bool start_32bit_address) const {
    if (!align) {
        align = page_size;
    } else {
@ -116,7 +124,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size
    }
    u64 available_size{};
-    GPUVAddr gpu_addr{address_space_start};
+    GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
    while (gpu_addr + available_size < address_space_size) {
        if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
            available_size += page_size;
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@ -116,6 +116,7 @@ public:
    [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
    [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
    [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
    [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
    [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
    void Unmap(GPUVAddr gpu_addr, std::size_t size);
@ -124,7 +125,8 @@ private:
    [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
    void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
    GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
-    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const;
+    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
                                                        bool start_32bit_address = false) const;
    void TryLockPage(PageEntry page_entry, std::size_t size);
    void TryUnlockPage(PageEntry page_entry, std::size_t size);
@ -135,6 +137,7 @@ private:
    static constexpr u64 address_space_size = 1ULL << 40;
    static constexpr u64 address_space_start = 1ULL << 32;
    static constexpr u64 address_space_start_low = 1ULL << 16;
    static constexpr u64 page_bits{16};
    static constexpr u64 page_size{1 << page_bits};
    static constexpr u64 page_mask{page_size - 1};
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@ -44,10 +44,11 @@ namespace VideoCore {
 std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
    std::unique_ptr<Tegra::GPU> gpu;
    const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
    if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
-        gpu = std::make_unique<VideoCommon::GPUAsynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec);
    } else {
-        gpu = std::make_unique<VideoCommon::GPUSynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec);
    }
    auto context = emu_window.CreateSharedContext();
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@ -265,9 +265,11 @@ if (MSVC)
    include(CopyYuzuQt5Deps)
    include(CopyYuzuSDLDeps)
    include(CopyYuzuUnicornDeps)
    include(CopyYuzuFFmpegDeps)
    copy_yuzu_Qt5_deps(yuzu)
    copy_yuzu_SDL_deps(yuzu)
    copy_yuzu_unicorn_deps(yuzu)
    copy_yuzu_FFmpeg_deps(yuzu)
 endif()
 if (NOT APPLE)
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@ -717,6 +717,8 @@ void Config::ReadRendererValues() {
    ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0);
    ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,
                      QStringLiteral("use_asynchronous_gpu_emulation"), false);
    ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"),
                      true);
    ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
    ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
                      false);
@ -1265,6 +1267,8 @@ void Config::SaveRendererValues() {
                       Settings::values.gpu_accuracy.UsingGlobal(), 0);
    WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),
                       Settings::values.use_asynchronous_gpu_emulation, false);
    WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation,
                       true);
    WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
    WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
                       Settings::values.use_assembly_shaders, false);
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@ -70,9 +70,11 @@ void ConfigureGraphics::SetConfiguration() {
    ui->api->setEnabled(runtime_lock);
    ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
    ui->use_disk_shader_cache->setEnabled(runtime_lock);
    ui->use_nvdec_emulation->setEnabled(runtime_lock);
    ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
    ui->use_asynchronous_gpu_emulation->setChecked(
        Settings::values.use_asynchronous_gpu_emulation.GetValue());
    ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue());
    if (Settings::configuring_global) {
        ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue()));
@ -116,6 +118,9 @@ void ConfigureGraphics::ApplyConfiguration() {
            Settings::values.use_asynchronous_gpu_emulation.SetValue(
                ui->use_asynchronous_gpu_emulation->isChecked());
        }
        if (Settings::values.use_nvdec_emulation.UsingGlobal()) {
            Settings::values.use_nvdec_emulation.SetValue(ui->use_nvdec_emulation->isChecked());
        }
        if (Settings::values.bg_red.UsingGlobal()) {
            Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));
            Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF()));
@ -144,6 +149,8 @@ void ConfigureGraphics::ApplyConfiguration() {
        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,
                                                 ui->use_asynchronous_gpu_emulation,
                                                 use_asynchronous_gpu_emulation);
        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation,
                                                 ui->use_nvdec_emulation, use_nvdec_emulation);
        if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
            Settings::values.bg_red.SetGlobal(true);
@ -240,6 +247,7 @@ void ConfigureGraphics::SetupPerGameUI() {
        ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());
        ui->use_asynchronous_gpu_emulation->setEnabled(
            Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
        ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal());
        ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());
        ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal());
@ -253,6 +261,8 @@ void ConfigureGraphics::SetupPerGameUI() {
    ConfigurationShared::SetColoredTristate(
        ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache);
    ConfigurationShared::SetColoredTristate(
        ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation);
    ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation,
                                            Settings::values.use_asynchronous_gpu_emulation,
                                            use_asynchronous_gpu_emulation);
--- a/src/yuzu/configuration/configure_graphics.h
+++ b/src/yuzu/configuration/configure_graphics.h
@ -46,6 +46,7 @@ private:
    std::unique_ptr<Ui::ConfigureGraphics> ui;
    QColor bg_color;
    ConfigurationShared::CheckState use_nvdec_emulation;
    ConfigurationShared::CheckState use_disk_shader_cache;
    ConfigurationShared::CheckState use_asynchronous_gpu_emulation;
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@ -97,6 +97,13 @@
          </property>
         </widget>
        </item>
        <item>
         <widget class="QCheckBox" name="use_nvdec_emulation">
          <property name="text">
           <string>Use NVDEC emulation</string>
          </property>
         </widget>
        </item>
        <item>
         <widget class="QWidget" name="aspect_ratio_layout" native="true">
          <layout class="QHBoxLayout" name="horizontalLayout_6">