1
0
Fork 0
forked from suyu/suyu

video_core: NVDEC Implementation

This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library.

The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data.

To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library.

Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header.

Async GPU is not properly implemented at the moment.

Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com>
This commit is contained in:
ameerj 2020-10-26 23:07:36 -04:00
parent 2f6ba54483
commit eb67a45ca8
53 changed files with 4033 additions and 310 deletions

View file

@ -263,6 +263,7 @@ if (CONAN_REQUIRED_LIBS)
libzip:with_openssl=False libzip:with_openssl=False
libzip:enable_windows_crypto=False libzip:enable_windows_crypto=False
) )
conan_check(VERSION 1.24.0 REQUIRED) conan_check(VERSION 1.24.0 REQUIRED)
# Add the bincrafters remote # Add the bincrafters remote
conan_add_remote(NAME bincrafters conan_add_remote(NAME bincrafters
@ -354,6 +355,19 @@ if (NOT LIBUSB_FOUND)
set(LIBUSB_LIBRARIES usb) set(LIBUSB_LIBRARIES usb)
endif() endif()
# Use system installed ffmpeg.
if (NOT MSVC)
find_package(FFmpeg REQUIRED)
else()
set(FFMPEG_EXT_NAME "ffmpeg-4.2.1")
set(FFMPEG_PATH "${CMAKE_BINARY_DIR}/externals/${FFMPEG_EXT_NAME}")
download_bundled_external("ffmpeg/" ${FFMPEG_EXT_NAME} "")
set(FFMPEG_FOUND YES)
set(FFMPEG_INCLUDE_DIR "${FFMPEG_PATH}/include" CACHE PATH "Path to FFmpeg headers" FORCE)
set(FFMPEG_LIBRARY_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg library" FORCE)
set(FFMPEG_DLL_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg dll's" FORCE)
endif()
# Prefer the -pthread flag on Linux. # Prefer the -pthread flag on Linux.
set(THREADS_PREFER_PTHREAD_FLAG ON) set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)

View file

@ -0,0 +1,10 @@
function(copy_yuzu_FFmpeg_deps target_dir)
include(WindowsCopyFiles)
set(DLL_DEST "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/")
windows_copy_files(${target_dir} ${FFMPEG_DLL_DIR} ${DLL_DEST}
avcodec-58.dll
avutil-56.dll
swresample-3.dll
swscale-5.dll
)
endfunction(copy_yuzu_FFmpeg_deps)

100
externals/find-modules/FindFFmpeg.cmake vendored Normal file
View file

@ -0,0 +1,100 @@
# - Try to find ffmpeg libraries (libavcodec, libavformat and libavutil)
# Once done this will define
#
# FFMPEG_FOUND - system has ffmpeg or libav
# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
# FFMPEG_LIBRARIES - Link these to use ffmpeg
# FFMPEG_LIBAVCODEC
# FFMPEG_LIBAVFORMAT
# FFMPEG_LIBAVUTIL
#
# Copyright (c) 2008 Andreas Schneider <mail@cynapses.org>
# Modified for other libraries by Lasse Kärkkäinen <tronic>
# Modified for Hedgewars by Stepik777
# Modified for FFmpeg-example Tuukka Pasanen 2018
# Modified for yuzu toastUnlimted 2020
#
# Redistribution and use is allowed according to the terms of the New
# BSD license.
#
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFMPEG
FOUND_VAR FFMPEG_FOUND
REQUIRED_VARS
FFMPEG_LIBRARY
FFMPEG_INCLUDE_DIR
VERSION_VAR FFMPEG_VERSION
)
if(FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
# in cache already
set(FFMPEG_FOUND TRUE)
else()
# use pkg-config to get the directories and then use these values
# in the FIND_PATH() and FIND_LIBRARY() calls
find_package(PkgConfig)
if(PKG_CONFIG_FOUND)
pkg_check_modules(_FFMPEG_AVCODEC libavcodec)
pkg_check_modules(_FFMPEG_AVUTIL libavutil)
pkg_check_modules(_FFMPEG_SWSCALE libswscale)
endif()
find_path(FFMPEG_AVCODEC_INCLUDE_DIR
NAMES libavcodec/avcodec.h
PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS}
/usr/include
/usr/local/include
/opt/local/include
/sw/include
PATH_SUFFIXES ffmpeg libav)
find_library(FFMPEG_LIBAVCODEC
NAMES avcodec
PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS}
/usr/lib
/usr/local/lib
/opt/local/lib
/sw/lib)
find_library(FFMPEG_LIBAVUTIL
NAMES avutil
PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS}
/usr/lib
/usr/local/lib
/opt/local/lib
/sw/lib)
find_library(FFMPEG_LIBSWSCALE
NAMES swscale
PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS}
/usr/lib
/usr/local/lib
/opt/local/lib
/sw/lib)
if(FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVUTIL AND FFMPEG_LIBSWSCALE)
set(FFMPEG_FOUND TRUE)
endif()
if(FFMPEG_FOUND)
set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
set(FFMPEG_LIBRARIES
${FFMPEG_LIBAVCODEC}
${FFMPEG_LIBAVUTIL}
${FFMPEG_LIBSWSCALE})
endif()
if(FFMPEG_FOUND)
if(NOT FFMPEG_FIND_QUIETLY)
message(STATUS
"Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
endif()
else()
if(FFMPEG_FIND_REQUIRED)
message(FATAL_ERROR
"Could not find libavcodec or libavutil or libswscale")
endif()
endif()
endif()

View file

@ -150,6 +150,8 @@ add_library(common STATIC
scope_exit.h scope_exit.h
spin_lock.cpp spin_lock.cpp
spin_lock.h spin_lock.h
stream.cpp
stream.h
string_util.cpp string_util.cpp
string_util.h string_util.h
swap.h swap.h

47
src/common/stream.cpp Normal file
View file

@ -0,0 +1,47 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <stdexcept>
#include "common/common_types.h"
#include "common/stream.h"
namespace Common {
Stream::Stream() = default;
Stream::~Stream() = default;
void Stream::Seek(s32 offset, SeekOrigin origin) {
if (origin == SeekOrigin::SetOrigin) {
if (offset < 0) {
position = 0;
} else if (position >= buffer.size()) {
position = buffer.size();
} else {
position = offset;
}
} else if (origin == SeekOrigin::FromCurrentPos) {
Seek(static_cast<s32>(position) + offset, SeekOrigin::SetOrigin);
} else if (origin == SeekOrigin::FromEnd) {
Seek(static_cast<s32>(buffer.size()) - offset, SeekOrigin::SetOrigin);
}
}
u8 Stream::ReadByte() {
if (position < buffer.size()) {
return buffer[position++];
} else {
throw std::out_of_range("Attempting to read a byte not within the buffer range");
}
}
void Stream::WriteByte(u8 byte) {
if (position == buffer.size()) {
buffer.push_back(byte);
position++;
} else {
buffer.insert(buffer.begin() + position, byte);
}
}
} // namespace Common

50
src/common/stream.h Normal file
View file

@ -0,0 +1,50 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_types.h"
namespace Common {
enum class SeekOrigin {
SetOrigin,
FromCurrentPos,
FromEnd,
};
class Stream {
public:
/// Stream creates a bitstream and provides common functionality on the stream.
explicit Stream();
~Stream();
/// Reposition bitstream "cursor" to the specified offset from origin
void Seek(s32 offset, SeekOrigin origin);
/// Reads next byte in the stream buffer and increments position
u8 ReadByte();
/// Writes byte at current position
void WriteByte(u8 byte);
std::size_t GetPosition() const {
return position;
}
std::vector<u8>& GetBuffer() {
return buffer;
}
const std::vector<u8>& GetBuffer() const {
return buffer;
}
private:
std::vector<u8> buffer;
std::size_t position{0};
};
} // namespace Common

View file

@ -439,6 +439,8 @@ add_library(core STATIC
hle/service/nvdrv/devices/nvhost_gpu.h hle/service/nvdrv/devices/nvhost_gpu.h
hle/service/nvdrv/devices/nvhost_nvdec.cpp hle/service/nvdrv/devices/nvhost_nvdec.cpp
hle/service/nvdrv/devices/nvhost_nvdec.h hle/service/nvdrv/devices/nvhost_nvdec.h
hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
hle/service/nvdrv/devices/nvhost_nvdec_common.h
hle/service/nvdrv/devices/nvhost_nvjpg.cpp hle/service/nvdrv/devices/nvhost_nvjpg.cpp
hle/service/nvdrv/devices/nvhost_nvjpg.h hle/service/nvdrv/devices/nvhost_nvjpg.h
hle/service/nvdrv/devices/nvhost_vic.cpp hle/service/nvdrv/devices/nvhost_vic.cpp

View file

@ -2,15 +2,17 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <cstring>
#include "common/assert.h" #include "common/assert.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices { namespace Service::Nvidia::Devices {
nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {} nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
nvhost_nvdec::~nvhost_nvdec() = default; nvhost_nvdec::~nvhost_nvdec() = default;
u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
switch (static_cast<IoctlCommand>(command.raw)) { switch (static_cast<IoctlCommand>(command.raw)) {
case IoctlCommand::IocSetNVMAPfdCommand: case IoctlCommand::IocSetNVMAPfdCommand:
return SetNVMAPfd(input, output); return SetNVMAPfd(input);
case IoctlCommand::IocSubmit: case IoctlCommand::IocSubmit:
return Submit(input, output); return Submit(input, output);
case IoctlCommand::IocGetSyncpoint: case IoctlCommand::IocGetSyncpoint:
@ -29,79 +31,29 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
case IoctlCommand::IocGetWaitbase: case IoctlCommand::IocGetWaitbase:
return GetWaitbase(input, output); return GetWaitbase(input, output);
case IoctlCommand::IocMapBuffer: case IoctlCommand::IocMapBuffer:
return MapBuffer(input, output); case IoctlCommand::IocMapBuffer2:
case IoctlCommand::IocMapBuffer3:
case IoctlCommand::IocMapBufferEx: case IoctlCommand::IocMapBufferEx:
return MapBufferEx(input, output); return MapBuffer(input, output);
case IoctlCommand::IocUnmapBufferEx: case IoctlCommand::IocUnmapBufferEx: {
return UnmapBufferEx(input, output); // This command is sent when the video stream has ended, flush all video contexts
// This is usually sent in the folowing order: vic, nvdec, vic.
// Inform the GPU to clear any remaining nvdec buffers when this is detected.
LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
Tegra::ChCommandHeaderList cmdlist(1);
cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
system.GPU().PushCommandBuffer(cmdlist);
[[fallthrough]]; // fallthrough to unmap buffers
};
case IoctlCommand::IocUnmapBuffer:
case IoctlCommand::IocUnmapBuffer2:
case IoctlCommand::IocUnmapBuffer3:
return UnmapBuffer(input, output);
case IoctlCommand::IocSetSubmitTimeout:
return SetSubmitTimeout(input, output);
} }
UNIMPLEMENTED_MSG("Unimplemented ioctl"); UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
return 0;
}
u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlSetNvmapFD params{};
std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
nvmap_fd = params.nvmap_fd;
return 0;
}
u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlSubmit params{};
std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
return 0;
}
u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetSyncpoint params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
params.value = 0; // Seems to be hard coded at 0
std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
return 0;
}
u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetWaitbase params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
params.value = 0; // Seems to be hard coded at 0
std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
return 0;
}
u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBuffer params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
params.address_1);
params.address_1 = 0;
params.address_2 = 0;
std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
return 0;
}
u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBufferEx params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
params.address_1);
params.address_1 = 0;
params.address_2 = 0;
std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
return 0;
}
u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlUnmapBufferEx params{};
std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
return 0; return 0;
} }

View file

@ -4,16 +4,14 @@
#pragma once #pragma once
#include <vector> #include <memory>
#include "common/common_types.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
#include "common/swap.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
namespace Service::Nvidia::Devices { namespace Service::Nvidia::Devices {
class nvhost_nvdec final : public nvdevice { class nvhost_nvdec final : public nvhost_nvdec_common {
public: public:
explicit nvhost_nvdec(Core::System& system); explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvhost_nvdec() override; ~nvhost_nvdec() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -27,62 +25,15 @@ private:
IocGetSyncpoint = 0xC0080002, IocGetSyncpoint = 0xC0080002,
IocGetWaitbase = 0xC0080003, IocGetWaitbase = 0xC0080003,
IocMapBuffer = 0xC01C0009, IocMapBuffer = 0xC01C0009,
IocMapBuffer2 = 0xC16C0009,
IocMapBuffer3 = 0xC15C0009,
IocMapBufferEx = 0xC0A40009, IocMapBufferEx = 0xC0A40009,
IocUnmapBufferEx = 0xC0A4000A, IocUnmapBuffer = 0xC0A4000A,
IocUnmapBuffer2 = 0xC16C000A,
IocUnmapBufferEx = 0xC01C000A,
IocUnmapBuffer3 = 0xC15C000A,
IocSetSubmitTimeout = 0x40040007,
}; };
struct IoctlSetNvmapFD {
u32_le nvmap_fd;
};
static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size");
struct IoctlSubmit {
INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size");
struct IoctlGetSyncpoint {
u32 unknown; // seems to be ignored? Nintendo added this
u32 value;
};
static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size");
struct IoctlGetWaitbase {
u32 unknown; // seems to be ignored? Nintendo added this
u32 value;
};
static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size");
struct IoctlMapBuffer {
u32 unknown;
u32 address_1;
u32 address_2;
INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
struct IoctlMapBufferEx {
u32 unknown;
u32 address_1;
u32 address_2;
INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size");
struct IoctlUnmapBufferEx {
INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size");
u32_le nvmap_fd{};
u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
}; };
} // namespace Service::Nvidia::Devices } // namespace Service::Nvidia::Devices

View file

@ -0,0 +1,234 @@
// Copyright 2020 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <cstring>
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
#include "core/hle/service/nvdrv/devices/nvmap.h"
#include "core/memory.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices {
namespace {
// Splice vectors will copy count amount of type T from the input vector into the dst vector.
template <typename T>
std::size_t SpliceVectors(const std::vector<u8>& input, std::vector<T>& dst, std::size_t count,
std::size_t offset) {
std::memcpy(dst.data(), input.data() + offset, count * sizeof(T));
offset += count * sizeof(T);
return offset;
}
// Write vectors will write data to the output buffer
template <typename T>
std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
std::memcpy(dst.data() + offset, src.data(), src.size() * sizeof(T));
offset += src.size() * sizeof(T);
return offset;
}
} // Anonymous namespace
namespace NvErrCodes {
constexpr u32 Success{};
constexpr u32 OutOfMemory{static_cast<u32>(-12)};
constexpr u32 InvalidInput{static_cast<u32>(-22)};
} // namespace NvErrCodes
nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
nvhost_nvdec_common::~nvhost_nvdec_common() = default;
u32 nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
IoctlSetNvmapFD params{};
std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
nvmap_fd = params.nvmap_fd;
return 0;
}
u32 nvhost_nvdec_common::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlSubmit params{};
std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
// Instantiate param buffers
std::size_t offset = sizeof(IoctlSubmit);
std::vector<CommandBuffer> command_buffers(params.cmd_buffer_count);
std::vector<Reloc> relocs(params.relocation_count);
std::vector<u32> reloc_shifts(params.relocation_count);
std::vector<SyncptIncr> syncpt_increments(params.syncpoint_count);
std::vector<SyncptIncr> wait_checks(params.syncpoint_count);
std::vector<Fence> fences(params.fence_count);
// Splice input into their respective buffers
offset = SpliceVectors(input, command_buffers, params.cmd_buffer_count, offset);
offset = SpliceVectors(input, relocs, params.relocation_count, offset);
offset = SpliceVectors(input, reloc_shifts, params.relocation_count, offset);
offset = SpliceVectors(input, syncpt_increments, params.syncpoint_count, offset);
offset = SpliceVectors(input, wait_checks, params.syncpoint_count, offset);
offset = SpliceVectors(input, fences, params.fence_count, offset);
// TODO(ameerj): For async gpu, utilize fences for syncpoint 'max' increment
auto& gpu = system.GPU();
for (const auto& cmd_buffer : command_buffers) {
auto object = nvmap_dev->GetObject(cmd_buffer.memory_id);
ASSERT_OR_EXECUTE(object, return NvErrCodes::InvalidInput;);
const auto map = FindBufferMap(object->dma_map_addr);
if (!map) {
LOG_ERROR(Service_NVDRV, "Tried to submit an invalid offset 0x{:X} dma 0x{:X}",
object->addr, object->dma_map_addr);
return 0;
}
Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
gpu.MemoryManager().ReadBlock(map->StartAddr() + cmd_buffer.offset, cmdlist.data(),
cmdlist.size() * sizeof(u32));
gpu.PushCommandBuffer(cmdlist);
}
std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
// Some games expect command_buffers to be written back
offset = sizeof(IoctlSubmit);
offset = WriteVectors(output, command_buffers, offset);
offset = WriteVectors(output, relocs, offset);
offset = WriteVectors(output, reloc_shifts, offset);
offset = WriteVectors(output, syncpt_increments, offset);
offset = WriteVectors(output, wait_checks, offset);
return NvErrCodes::Success;
}
u32 nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetSyncpoint params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param);
// We found that implementing this causes deadlocks with async gpu, along with degraded
// performance. TODO: RE the nvdec async implementation
params.value = 0;
std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
return NvErrCodes::Success;
}
u32 nvhost_nvdec_common::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetWaitbase params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
params.value = 0; // Seems to be hard coded at 0
std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
return 0;
}
u32 nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBuffer params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
auto& gpu = system.GPU();
for (auto& cmf_buff : cmd_buffer_handles) {
auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
if (!object) {
LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
std::memcpy(output.data(), &params, output.size());
return NvErrCodes::InvalidInput;
}
if (object->dma_map_addr == 0) {
// NVDEC and VIC memory is in the 32-bit address space
// MapAllocate32 will attempt to map a lower 32-bit value in the shared gpu memory space
const GPUVAddr low_addr = gpu.MemoryManager().MapAllocate32(object->addr, object->size);
object->dma_map_addr = static_cast<u32>(low_addr);
// Ensure that the dma_map_addr is indeed in the lower 32-bit address space.
ASSERT(object->dma_map_addr == low_addr);
}
if (!object->dma_map_addr) {
LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
} else {
cmf_buff.map_address = object->dma_map_addr;
AddBufferMap(object->dma_map_addr, object->size, object->addr,
object->status == nvmap::Object::Status::Allocated);
}
}
std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
std::memcpy(output.data() + sizeof(IoctlMapBuffer), cmd_buffer_handles.data(),
cmd_buffer_handles.size() * sizeof(MapBufferEntry));
return NvErrCodes::Success;
}
u32 nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBuffer params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
auto& gpu = system.GPU();
for (auto& cmf_buff : cmd_buffer_handles) {
const auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
if (!object) {
LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
std::memcpy(output.data(), &params, output.size());
return NvErrCodes::InvalidInput;
}
if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
} else {
// This occurs quite frequently, however does not seem to impact functionality
LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
object->dma_map_addr);
}
object->dma_map_addr = 0;
}
std::memset(output.data(), 0, output.size());
return NvErrCodes::Success;
}
u32 nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output) {
std::memcpy(&submit_timeout, input.data(), input.size());
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
return NvErrCodes::Success;
}
std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
GPUVAddr gpu_addr) const {
const auto it = std::find_if(
buffer_mappings.begin(), buffer_mappings.upper_bound(gpu_addr), [&](const auto& entry) {
return (gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr());
});
ASSERT(it != buffer_mappings.end());
return it->second;
}
void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
bool is_allocated) {
buffer_mappings.insert_or_assign(gpu_addr, BufferMap{gpu_addr, size, cpu_addr, is_allocated});
}
std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
const auto iter{buffer_mappings.find(gpu_addr)};
if (iter == buffer_mappings.end()) {
return std::nullopt;
}
std::size_t size = 0;
if (iter->second.IsAllocated()) {
size = iter->second.Size();
}
buffer_mappings.erase(iter);
return size;
}
} // namespace Service::Nvidia::Devices

View file

@ -0,0 +1,168 @@
// Copyright 2020 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <map>
#include <vector>
#include "common/common_types.h"
#include "common/swap.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
namespace Service::Nvidia::Devices {
class nvmap;
class nvhost_nvdec_common : public nvdevice {
public:
explicit nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvhost_nvdec_common() override;
virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
IoctlVersion version) = 0;
protected:
class BufferMap final {
public:
constexpr BufferMap() = default;
constexpr BufferMap(GPUVAddr start_addr, std::size_t size)
: start_addr{start_addr}, end_addr{start_addr + size} {}
constexpr BufferMap(GPUVAddr start_addr, std::size_t size, VAddr cpu_addr,
bool is_allocated)
: start_addr{start_addr}, end_addr{start_addr + size}, cpu_addr{cpu_addr},
is_allocated{is_allocated} {}
constexpr VAddr StartAddr() const {
return start_addr;
}
constexpr VAddr EndAddr() const {
return end_addr;
}
constexpr std::size_t Size() const {
return end_addr - start_addr;
}
constexpr VAddr CpuAddr() const {
return cpu_addr;
}
constexpr bool IsAllocated() const {
return is_allocated;
}
private:
GPUVAddr start_addr{};
GPUVAddr end_addr{};
VAddr cpu_addr{};
bool is_allocated{};
};
struct IoctlSetNvmapFD {
u32_le nvmap_fd;
};
static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
struct IoctlSubmitCommandBuffer {
u32_le id;
u32_le offset;
u32_le count;
};
static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
"IoctlSubmitCommandBuffer is incorrect size");
struct IoctlSubmit {
u32_le cmd_buffer_count;
u32_le relocation_count;
u32_le syncpoint_count;
u32_le fence_count;
};
static_assert(sizeof(IoctlSubmit) == 0x10, "IoctlSubmit has incorrect size");
struct CommandBuffer {
s32 memory_id;
u32 offset;
s32 word_count;
};
static_assert(sizeof(CommandBuffer) == 0xC, "CommandBuffer has incorrect size");
struct Reloc {
s32 cmdbuffer_memory;
s32 cmdbuffer_offset;
s32 target;
s32 target_offset;
};
static_assert(sizeof(Reloc) == 0x10, "CommandBuffer has incorrect size");
struct SyncptIncr {
u32 id;
u32 increments;
};
static_assert(sizeof(SyncptIncr) == 0x8, "CommandBuffer has incorrect size");
struct Fence {
u32 id;
u32 value;
};
static_assert(sizeof(Fence) == 0x8, "CommandBuffer has incorrect size");
struct IoctlGetSyncpoint {
// Input
u32_le param;
// Output
u32_le value;
};
static_assert(sizeof(IoctlGetSyncpoint) == 8, "IocGetIdParams has wrong size");
struct IoctlGetWaitbase {
u32_le unknown; // seems to be ignored? Nintendo added this
u32_le value;
};
static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
struct IoctlMapBuffer {
u32_le num_entries;
u32_le data_address; // Ignored by the driver.
u32_le attach_host_ch_das;
};
static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
struct IocGetIdParams {
// Input
u32_le param;
// Output
u32_le value;
};
static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
// Used for mapping and unmapping command buffers
struct MapBufferEntry {
u32_le map_handle;
u32_le map_address;
};
static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
/// Ioctl command implementations
u32 SetNVMAPfd(const std::vector<u8>& input);
u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
u32 UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
u32 SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
u32_le nvmap_fd{};
u32_le submit_timeout{};
std::shared_ptr<nvmap> nvmap_dev;
// This is expected to be ordered, therefore we must use a map, not unordered_map
std::map<GPUVAddr, BufferMap> buffer_mappings;
};
}; // namespace Service::Nvidia::Devices

View file

@ -2,15 +2,17 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <cstring>
#include "common/assert.h" #include "common/assert.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/service/nvdrv/devices/nvhost_vic.h" #include "core/hle/service/nvdrv/devices/nvhost_vic.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices { namespace Service::Nvidia::Devices {
nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
nvhost_vic::~nvhost_vic() = default; nvhost_vic::~nvhost_vic() = default;
u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
switch (static_cast<IoctlCommand>(command.raw)) { switch (static_cast<IoctlCommand>(command.raw)) {
case IoctlCommand::IocSetNVMAPfdCommand: case IoctlCommand::IocSetNVMAPfdCommand:
return SetNVMAPfd(input, output); return SetNVMAPfd(input);
case IoctlCommand::IocSubmit: case IoctlCommand::IocSubmit:
return Submit(input, output); return Submit(input, output);
case IoctlCommand::IocGetSyncpoint: case IoctlCommand::IocGetSyncpoint:
@ -29,83 +31,19 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
case IoctlCommand::IocGetWaitbase: case IoctlCommand::IocGetWaitbase:
return GetWaitbase(input, output); return GetWaitbase(input, output);
case IoctlCommand::IocMapBuffer: case IoctlCommand::IocMapBuffer:
return MapBuffer(input, output); case IoctlCommand::IocMapBuffer2:
case IoctlCommand::IocMapBuffer3:
case IoctlCommand::IocMapBuffer4:
case IoctlCommand::IocMapBufferEx: case IoctlCommand::IocMapBufferEx:
return MapBuffer(input, output); return MapBuffer(input, output);
case IoctlCommand::IocUnmapBuffer:
case IoctlCommand::IocUnmapBuffer2:
case IoctlCommand::IocUnmapBuffer3:
case IoctlCommand::IocUnmapBufferEx: case IoctlCommand::IocUnmapBufferEx:
return UnmapBufferEx(input, output); return UnmapBuffer(input, output);
} }
UNIMPLEMENTED_MSG("Unimplemented ioctl"); UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
return 0;
}
u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlSetNvmapFD params{};
std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
nvmap_fd = params.nvmap_fd;
return 0;
}
u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlSubmit params{};
std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
// Workaround for Luigi's Mansion 3, as nvhost_vic is not implemented for asynch GPU
params.command_buffer = {};
std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
return 0;
}
u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetSyncpoint params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
params.value = 0; // Seems to be hard coded at 0
std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
return 0;
}
u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlGetWaitbase params{};
std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
params.value = 0; // Seems to be hard coded at 0
std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
return 0;
}
u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBuffer params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
params.address_1);
params.address_1 = 0;
params.address_2 = 0;
std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
return 0;
}
u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBufferEx params{};
std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
params.address_1);
params.address_1 = 0;
params.address_2 = 0;
std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
return 0;
}
u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlUnmapBufferEx params{};
std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
return 0; return 0;
} }

View file

@ -4,19 +4,15 @@
#pragma once #pragma once
#include <array> #include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
#include <vector>
#include "common/common_types.h"
#include "common/swap.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
namespace Service::Nvidia::Devices { namespace Service::Nvidia::Devices {
class nvmap;
class nvhost_vic final : public nvdevice { class nvhost_vic final : public nvhost_nvdec_common {
public: public:
explicit nvhost_vic(Core::System& system); explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvhost_vic() override; ~nvhost_vic();
u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl, std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
IoctlVersion version) override; IoctlVersion version) override;
@ -28,74 +24,14 @@ private:
IocGetSyncpoint = 0xC0080002, IocGetSyncpoint = 0xC0080002,
IocGetWaitbase = 0xC0080003, IocGetWaitbase = 0xC0080003,
IocMapBuffer = 0xC01C0009, IocMapBuffer = 0xC01C0009,
IocMapBuffer2 = 0xC0340009,
IocMapBuffer3 = 0xC0140009,
IocMapBuffer4 = 0xC00C0009,
IocMapBufferEx = 0xC03C0009, IocMapBufferEx = 0xC03C0009,
IocUnmapBufferEx = 0xC03C000A, IocUnmapBuffer = 0xC03C000A,
IocUnmapBuffer2 = 0xC034000A,
IocUnmapBuffer3 = 0xC00C000A,
IocUnmapBufferEx = 0xC01C000A,
}; };
struct IoctlSetNvmapFD {
u32_le nvmap_fd;
};
static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
struct IoctlSubmitCommandBuffer {
u32 id;
u32 offset;
u32 count;
};
static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
"IoctlSubmitCommandBuffer is incorrect size");
struct IoctlSubmit {
u32 command_buffer_count;
u32 relocations_count;
u32 syncpt_count;
u32 wait_count;
std::array<IoctlSubmitCommandBuffer, 4> command_buffer;
};
static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size");
struct IoctlGetSyncpoint {
u32 unknown; // seems to be ignored? Nintendo added this
u32 value;
};
static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size");
struct IoctlGetWaitbase {
u32 unknown; // seems to be ignored? Nintendo added this
u32 value;
};
static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
struct IoctlMapBuffer {
u32 unknown;
u32 address_1;
u32 address_2;
INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
struct IoctlMapBufferEx {
u32 unknown;
u32 address_1;
u32 address_2;
INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size");
struct IoctlUnmapBufferEx {
INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure
};
static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size");
u32_le nvmap_fd{};
u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
}; };
} // namespace Service::Nvidia::Devices } // namespace Service::Nvidia::Devices

View file

@ -37,6 +37,7 @@ public:
VAddr addr; VAddr addr;
Status status; Status status;
u32 refcount; u32 refcount;
u32 dma_map_addr;
}; };
std::shared_ptr<Object> GetObject(u32 handle) const { std::shared_ptr<Object> GetObject(u32 handle) const {

View file

@ -51,9 +51,9 @@ Module::Module(Core::System& system) {
devices["/dev/nvmap"] = nvmap_dev; devices["/dev/nvmap"] = nvmap_dev;
devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev); devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface); devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system); devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);
devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system); devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system); devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);
} }
Module::~Module() = default; Module::~Module() = default;

View file

@ -63,6 +63,7 @@ void LogSettings() {
log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue()); log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue());
log_setting("Renderer_UseAsynchronousGpuEmulation", log_setting("Renderer_UseAsynchronousGpuEmulation",
values.use_asynchronous_gpu_emulation.GetValue()); values.use_asynchronous_gpu_emulation.GetValue());
log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());
log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue()); log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());
log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
@ -119,6 +120,7 @@ void RestoreGlobalState() {
values.use_disk_shader_cache.SetGlobal(true); values.use_disk_shader_cache.SetGlobal(true);
values.gpu_accuracy.SetGlobal(true); values.gpu_accuracy.SetGlobal(true);
values.use_asynchronous_gpu_emulation.SetGlobal(true); values.use_asynchronous_gpu_emulation.SetGlobal(true);
values.use_nvdec_emulation.SetGlobal(true);
values.use_vsync.SetGlobal(true); values.use_vsync.SetGlobal(true);
values.use_assembly_shaders.SetGlobal(true); values.use_assembly_shaders.SetGlobal(true);
values.use_asynchronous_shaders.SetGlobal(true); values.use_asynchronous_shaders.SetGlobal(true);

View file

@ -111,6 +111,7 @@ struct Values {
Setting<bool> use_disk_shader_cache; Setting<bool> use_disk_shader_cache;
Setting<GPUAccuracy> gpu_accuracy; Setting<GPUAccuracy> gpu_accuracy;
Setting<bool> use_asynchronous_gpu_emulation; Setting<bool> use_asynchronous_gpu_emulation;
Setting<bool> use_nvdec_emulation;
Setting<bool> use_vsync; Setting<bool> use_vsync;
Setting<bool> use_assembly_shaders; Setting<bool> use_assembly_shaders;
Setting<bool> use_asynchronous_shaders; Setting<bool> use_asynchronous_shaders;

View file

@ -206,6 +206,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue())); TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));
AddField(field_type, "Renderer_UseAsynchronousGpuEmulation", AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
Settings::values.use_asynchronous_gpu_emulation.GetValue()); Settings::values.use_asynchronous_gpu_emulation.GetValue());
AddField(field_type, "Renderer_UseNvdecEmulation",
Settings::values.use_nvdec_emulation.GetValue());
AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue()); AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
AddField(field_type, "Renderer_UseAssemblyShaders", AddField(field_type, "Renderer_UseAssemblyShaders",
Settings::values.use_assembly_shaders.GetValue()); Settings::values.use_assembly_shaders.GetValue());

View file

@ -5,6 +5,24 @@ add_library(video_core STATIC
buffer_cache/buffer_cache.h buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp buffer_cache/map_interval.cpp
buffer_cache/map_interval.h buffer_cache/map_interval.h
cdma_pusher.cpp
cdma_pusher.h
command_classes/codecs/codec.cpp
command_classes/codecs/codec.h
command_classes/codecs/h264.cpp
command_classes/codecs/h264.h
command_classes/codecs/vp9.cpp
command_classes/codecs/vp9.h
command_classes/codecs/vp9_types.h
command_classes/host1x.cpp
command_classes/host1x.h
command_classes/nvdec.cpp
command_classes/nvdec.h
command_classes/nvdec_common.h
command_classes/sync_manager.cpp
command_classes/sync_manager.h
command_classes/vic.cpp
command_classes/vic.h
compatible_formats.cpp compatible_formats.cpp
compatible_formats.h compatible_formats.h
dirty_flags.cpp dirty_flags.cpp
@ -250,6 +268,14 @@ create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core) target_link_libraries(video_core PUBLIC common core)
target_link_libraries(video_core PRIVATE glad xbyak) target_link_libraries(video_core PRIVATE glad xbyak)
if (MSVC)
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
else()
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
endif()
add_dependencies(video_core host_shaders) add_dependencies(video_core host_shaders)
target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})

View file

@ -0,0 +1,171 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "command_classes/host1x.h"
#include "command_classes/nvdec.h"
#include "command_classes/vic.h"
#include "common/bit_util.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
CDmaPusher::CDmaPusher(GPU& gpu)
: gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)),
vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
host1x_processor(std::make_unique<Host1x>(gpu)),
nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)),
vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {}
CDmaPusher::~CDmaPusher() = default;
void CDmaPusher::Push(ChCommandHeaderList&& entries) {
cdma_queue.push(std::move(entries));
}
void CDmaPusher::DispatchCalls() {
while (!cdma_queue.empty()) {
Step();
}
}
void CDmaPusher::Step() {
const auto entries{cdma_queue.front()};
cdma_queue.pop();
std::vector<u32> values(entries.size());
std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
for (const u32 value : values) {
if (mask != 0) {
const u32 lbs = Common::CountTrailingZeroes32(mask);
mask &= ~(1U << lbs);
ExecuteCommand(static_cast<u32>(offset + lbs), value);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(static_cast<u32>(offset), value);
if (incrementing) {
++offset;
}
continue;
}
const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value & 0x3f;
offset = (value >> 16) & 0xfff;
current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value & 0xffff;
offset = (value >> 16) & 0xfff;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value & 0xffff;
offset = (value >> 16) & 0xfff;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value & 0xfff;
offset = (value >> 16) & 0xfff;
ExecuteCommand(static_cast<u32>(offset), data);
break;
}
default:
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
break;
}
}
}
void CDmaPusher::ExecuteCommand(u32 offset, u32 data) {
switch (current_class) {
case ChClassId::NvDec:
ThiStateWrite(nvdec_thi_state, offset, {data});
switch (static_cast<ThiMethod>(offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
nvdec_sync->Increment(syncpoint_id);
} else {
nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
nvdec_sync->SignalDone(syncpoint_id);
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
static_cast<u32>(nvdec_thi_state.method_0));
nvdec_processor->ProcessMethod(
static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data});
break;
default:
break;
}
break;
case ChClassId::GraphicsVic:
ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data});
switch (static_cast<ThiMethod>(offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
vic_sync->Increment(syncpoint_id);
} else {
vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
vic_sync->SignalDone(syncpoint_id);
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
static_cast<u32>(vic_thi_state.method_0));
vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0),
{data});
break;
default:
break;
}
break;
case ChClassId::Host1x:
// This device is mainly for syncpoint synchronization
LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data});
break;
default:
UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
break;
}
}
void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset;
std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size());
}
} // namespace Tegra

View file

@ -0,0 +1,138 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include <queue>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Nvdec;
class Vic;
class Host1x;
enum class ChSubmissionMode : u32 {
SetClass = 0,
Incrementing = 1,
NonIncrementing = 2,
Mask = 3,
Immediate = 4,
Restart = 5,
Gather = 6,
};
enum class ChClassId : u32 {
NoClass = 0x0,
Host1x = 0x1,
VideoEncodeMpeg = 0x20,
VideoEncodeNvEnc = 0x21,
VideoStreamingVi = 0x30,
VideoStreamingIsp = 0x32,
VideoStreamingIspB = 0x34,
VideoStreamingViI2c = 0x36,
GraphicsVic = 0x5d,
Graphics3D = 0x60,
GraphicsGpu = 0x61,
Tsec = 0xe0,
TsecB = 0xe1,
NvJpg = 0xc0,
NvDec = 0xf0
};
enum class ChMethod : u32 {
Empty = 0,
SetMethod = 0x10,
SetData = 0x11,
};
union ChCommandHeader {
u32 raw;
BitField<0, 16, u32> value;
BitField<16, 12, ChMethod> method_offset;
BitField<28, 4, ChSubmissionMode> submission_mode;
};
static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
struct ChCommand {
ChClassId class_id{};
int method_offset{};
std::vector<u32> arguments;
};
using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>;
using ChCommandList = std::vector<Tegra::ChCommand>;
struct ThiRegisters {
u32_le increment_syncpt{};
INSERT_PADDING_WORDS(1);
u32_le increment_syncpt_error{};
u32_le ctx_switch_incremement_syncpt{};
INSERT_PADDING_WORDS(4);
u32_le ctx_switch{};
INSERT_PADDING_WORDS(1);
u32_le ctx_syncpt_eof{};
INSERT_PADDING_WORDS(5);
u32_le method_0{};
u32_le method_1{};
INSERT_PADDING_WORDS(12);
u32_le int_status{};
u32_le int_mask{};
};
enum class ThiMethod : u32 {
IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
};
class CDmaPusher {
public:
explicit CDmaPusher(GPU& gpu);
~CDmaPusher();
/// Push NVDEC command buffer entries into queue
void Push(ChCommandHeaderList&& entries);
/// Process queued command buffer entries
void DispatchCalls();
/// Process one queue element
void Step();
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 offset, u32 data);
private:
/// Write arguments value to the ThiRegisters member at the specified offset
void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments);
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
std::unique_ptr<Tegra::Vic> vic_processor;
std::unique_ptr<Tegra::Host1x> host1x_processor;
std::unique_ptr<SyncptIncrManager> nvdec_sync;
std::unique_ptr<SyncptIncrManager> vic_sync;
ChClassId current_class{};
ThiRegisters vic_thi_state{};
ThiRegisters nvdec_thi_state{};
s32 count{};
s32 offset{};
s32 mask{};
bool incrementing{};
// Queue of command lists to be processed
std::queue<ChCommandHeaderList> cdma_queue;
};
} // namespace Tegra

View file

@ -0,0 +1,114 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <fstream>
#include "common/assert.h"
#include "video_core/command_classes/codecs/codec.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
extern "C" {
#include <libavutil/opt.h>
}
namespace Tegra {
Codec::Codec(GPU& gpu_)
: gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
Codec::~Codec() {
if (!initialized) {
return;
}
// Free libav memory
avcodec_send_packet(av_codec_ctx, nullptr);
avcodec_receive_frame(av_codec_ctx, av_frame);
avcodec_flush_buffers(av_codec_ctx);
av_frame_unref(av_frame);
av_free(av_frame);
avcodec_close(av_codec_ctx);
}
void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec));
current_codec = codec;
}
void Codec::StateWrite(u32 offset, u64 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
std::memcpy(state_offset, &arguments, sizeof(u64));
}
void Codec::Decode() {
bool is_first_frame = false;
if (!initialized) {
if (current_codec == NvdecCommon::VideoCodec::H264) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
} else {
LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec));
return;
}
av_codec_ctx = avcodec_alloc_context3(av_codec);
av_frame = av_frame_alloc();
av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
// TODO(ameerj): libavcodec gpu hw acceleration
const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
if (av_error < 0) {
LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
av_frame_unref(av_frame);
av_free(av_frame);
avcodec_close(av_codec_ctx);
return;
}
initialized = true;
is_first_frame = true;
}
bool vp9_hidden_frame = false;
AVPacket packet{};
av_init_packet(&packet);
std::vector<u8> frame_data;
if (current_codec == NvdecCommon::VideoCodec::H264) {
frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
frame_data = vp9_decoder->ComposeFrameHeader(state);
vp9_hidden_frame = vp9_decoder->WasFrameHidden();
}
packet.data = frame_data.data();
packet.size = static_cast<int>(frame_data.size());
avcodec_send_packet(av_codec_ctx, &packet);
if (!vp9_hidden_frame) {
// Only receive/store visible frames
avcodec_receive_frame(av_codec_ctx, av_frame);
}
}
AVFrame* Codec::GetCurrentFrame() {
return av_frame;
}
const AVFrame* Codec::GetCurrentFrame() const {
return av_frame;
}
NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
return current_codec;
}
} // namespace Tegra

View file

@ -0,0 +1,68 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
extern "C" {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <libavcodec/avcodec.h>
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
namespace Tegra {
class GPU;
struct VicRegisters;
namespace Decoder {
class H264;
class VP9;
} // namespace Decoder
class Codec {
public:
explicit Codec(GPU& gpu);
~Codec();
/// Sets NVDEC video stream codec
void SetTargetCodec(NvdecCommon::VideoCodec codec);
/// Populate NvdecRegisters state with argument value at the provided offset
void StateWrite(u32 offset, u64 arguments);
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
/// Returns most recently decoded frame
AVFrame* GetCurrentFrame();
const AVFrame* GetCurrentFrame() const;
/// Returns the value of current_codec
NvdecCommon::VideoCodec GetCurrentCodec() const;
private:
bool initialized{};
NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
AVCodec* av_codec{nullptr};
AVCodecContext* av_codec_ctx{nullptr};
AVFrame* av_frame{nullptr};
GPU& gpu;
std::unique_ptr<Decoder::H264> h264_decoder;
std::unique_ptr<Decoder::VP9> vp9_decoder;
NvdecCommon::NvdecRegisters state{};
};
} // namespace Tegra

View file

@ -0,0 +1,276 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "common/bit_util.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
H264::H264(GPU& gpu_) : gpu(gpu_) {}
H264::~H264() = default;
std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) {
H264DecoderContext context{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff);
if (!is_first_frame && frame_number != 0) {
frame.resize(context.frame_data_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
} else {
/// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(7, 5);
writer.WriteU(100, 8);
writer.WriteU(0, 8);
writer.WriteU(31, 8);
writer.WriteUe(0);
const s32 chroma_format_idc = (context.h264_parameter_set.flags >> 12) & 0x3;
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
}
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
writer.WriteBit(false); // Scaling matrix present flag
const s32 order_cnt_type = static_cast<s32>((context.h264_parameter_set.flags >> 14) & 3);
writer.WriteUe(static_cast<s32>((context.h264_parameter_set.flags >> 8) & 0xf));
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt);
} else if (order_cnt_type == 1) {
writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units /
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
writer.WriteUe(16);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
if (!context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0);
}
writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
writer.End();
// H264 PPS
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(8, 5);
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag);
writer.WriteBit(false);
writer.WriteUe(0);
writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0);
writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2);
s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f);
pic_init_qp = (pic_init_qp << 26) >> 26;
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f);
chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset);
writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0);
writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0);
writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0);
writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true);
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
const auto matrix_x4 =
std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end());
writer.WriteScalingList(matrix_x4, index * 16, 16);
}
if (context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(),
context.scaling_matrix_8.end());
writer.WriteScalingList(matrix_x8, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f);
chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
frame.resize(encoded_header.size() + context.frame_data_size);
std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
frame.data() + encoded_header.size(),
context.frame_data_size);
}
return frame;
}
H264BitWriter::H264BitWriter() = default;
H264BitWriter::~H264BitWriter() = default;
void H264BitWriter::WriteU(s32 value, s32 value_sz) {
WriteBits(value, value_sz);
}
void H264BitWriter::WriteSe(s32 value) {
WriteExpGolombCodedInt(value);
}
void H264BitWriter::WriteUe(s32 value) {
WriteExpGolombCodedUInt((u32)value);
}
void H264BitWriter::End() {
WriteBit(true);
Flush();
}
void H264BitWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
std::vector<u8> scan(count);
if (count == 16) {
std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
} else {
std::memcpy(scan.data(), zig_zag_direct.data(), scan.size());
}
u8 last_scale = 8;
for (s32 index = 0; index < count; index++) {
const u8 value = list[start + scan[index]];
const s32 delta_scale = static_cast<s32>(value - last_scale);
WriteSe(delta_scale);
last_scale = value;
}
}
std::vector<u8>& H264BitWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& H264BitWriter::GetByteArray() const {
return byte_array;
}
void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
s32 value_pos = 0;
s32 remaining = bit_count;
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free_bits = GetFreeBufferBits();
if (copy_size > free_bits) {
copy_size = free_bits;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
const s32 sign = value <= 0 ? 0 : 1;
if (value < 0) {
value = -value;
}
value = (value << 1) - sign;
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1));
WriteBits(1, size);
value -= (1U << (size - 1)) - 1;
WriteBits(static_cast<s32>(value), size - 1);
}
s32 H264BitWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void H264BitWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
} // namespace Tegra::Decoder

View file

@ -0,0 +1,130 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
namespace Decoder {
class H264BitWriter {
public:
H264BitWriter();
~H264BitWriter();
/// The following Write methods are based on clause 9.1 in the H.264 specification.
/// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
void WriteU(s32 value, s32 value_sz);
void WriteSe(s32 value);
void WriteUe(s32 value);
/// Finalize the bitstream
void End();
/// append a bit to the stream, equivalent value to the state parameter
void WriteBit(bool state);
/// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
/// Writes the scaling matrices of the sream
void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
/// Return the bitstream as a vector.
std::vector<u8>& GetByteArray();
const std::vector<u8>& GetByteArray() const;
private:
// ZigZag LUTs from libavcodec.
static constexpr std::array<u8, 64> zig_zag_direct{
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48,
41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
};
static constexpr std::array<u8, 16> zig_zag_scan{
0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
};
void WriteBits(s32 value, s32 bit_count);
void WriteExpGolombCodedInt(s32 value);
void WriteExpGolombCodedUInt(u32 value);
s32 GetFreeBufferBits();
void Flush();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class H264 {
public:
explicit H264(GPU& gpu);
~H264();
/// Compose the H264 header of the frame for FFmpeg decoding
std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
bool is_first_frame = false);
private:
struct H264ParameterSet {
u32 log2_max_pic_order_cnt{};
u32 delta_pic_order_always_zero_flag{};
u32 frame_mbs_only_flag{};
u32 pic_width_in_mbs{};
u32 pic_height_in_map_units{};
INSERT_PADDING_WORDS(1);
u32 entropy_coding_mode_flag{};
u32 bottom_field_pic_order_flag{};
u32 num_refidx_l0_default_active{};
u32 num_refidx_l1_default_active{};
u32 deblocking_filter_control_flag{};
u32 redundant_pic_count_flag{};
u32 transform_8x8_mode_flag{};
INSERT_PADDING_WORDS(9);
u64 flags{};
u32 frame_number{};
u32 frame_number2{};
};
static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
struct H264DecoderContext {
INSERT_PADDING_BYTES(0x48);
u32 frame_data_size{};
INSERT_PADDING_BYTES(0xc);
H264ParameterSet h264_parameter_set{};
INSERT_PADDING_BYTES(0x100);
std::array<u8, 0x60> scaling_matrix_4;
std::array<u8, 0x80> scaling_matrix_8;
};
static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
std::vector<u8> frame;
GPU& gpu;
};
} // namespace Decoder
} // namespace Tegra

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,216 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <unordered_map>
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/stream.h"
#include "video_core/command_classes/codecs/vp9_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
enum class FrameType { KeyFrame = 0, InterFrame = 1 };
namespace Decoder {
/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
class VpxRangeEncoder {
public:
VpxRangeEncoder();
~VpxRangeEncoder();
/// Writes the rightmost value_size bits from value into the stream
void Write(s32 value, s32 value_size);
/// Writes a single bit with half probability
void Write(bool bit);
/// Writes a bit to the base_stream encoded with probability
void Write(bool bit, s32 probability);
/// Signal the end of the bitstream
void End();
std::vector<u8>& GetBuffer() {
return base_stream.GetBuffer();
}
const std::vector<u8>& GetBuffer() const {
return base_stream.GetBuffer();
}
private:
u8 PeekByte();
Common::Stream base_stream{};
u32 low_value{};
u32 range{0xff};
s32 count{-24};
s32 half_probability{128};
static constexpr std::array<s32, 256> norm_lut{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
};
class VpxBitStreamWriter {
public:
VpxBitStreamWriter();
~VpxBitStreamWriter();
/// Write an unsigned integer value
void WriteU(u32 value, u32 value_size);
/// Write a signed integer value
void WriteS(s32 value, u32 value_size);
/// Based on 6.2.10 of VP9 Spec, writes a delta coded value
void WriteDeltaQ(u32 value);
/// Write a single bit.
void WriteBit(bool state);
/// Pushes current buffer into buffer_array, resets buffer
void Flush();
/// Returns byte_array
std::vector<u8>& GetByteArray();
/// Returns const byte_array
const std::vector<u8>& GetByteArray() const;
private:
/// Write bit_count bits from value into buffer
void WriteBits(u32 value, u32 bit_count);
/// Gets next available position in buffer, invokes Flush() if buffer is full
s32 GetFreeBufferBits();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class VP9 {
public:
explicit VP9(GPU& gpu);
~VP9();
/// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
/// documentation
std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);
/// Returns true if the most recent frame was a hidden frame.
bool WasFrameHidden() const {
return hidden;
}
private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Generates compressed header probability updates in the bitstream writer
/// If probs are not equal, WriteProbabilityDelta is invoked
void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Generates compressed header probability deltas in the bitstream writer
void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
s32 RemapProbability(s32 new_prob, s32 old_prob);
/// Recenters probability. Based on section 6.3.6 of VP9 Specification
s32 RecenterNonNeg(s32 new_prob, s32 old_prob);
/// Inverse of 6.3.4 Decode term subexp
void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
/// Writes if the value is less than the test value
bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
/// Writes probability updates for the Coef probabilities
void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 2304>& new_prob,
const std::array<u8, 2304>& old_prob);
/// Write probabilities for 4-byte aligned structures
template <typename T, std::size_t N>
void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// 6.2.14 Tile size calculation
s32 CalcMinLog2TileCols(s32 frame_width);
s32 CalcMaxLog2TileCols(s32 frame_width);
/// Returns VP9 information from NVDEC provided offset and size
Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
/// Use NVDEC providied information to compose the headers for the current frame
std::vector<u8> ComposeCompressedHeader();
VpxBitStreamWriter ComposeUncompressedHeader();
GPU& gpu;
std::vector<u8> frame;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
bool hidden;
s64 current_frame_number = -2; // since we buffer 2 frames
s32 grace_period = 6; // frame offsets need to stabilize
std::array<FrameContexts, 4> frame_ctxs{};
Vp9FrameContainer next_frame{};
Vp9FrameContainer next_next_frame{};
bool swap_next_golden{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
s32 diff_update_probability = 252;
s32 frame_sync_code = 0x498342;
static constexpr std::array<s32, 254> map_lut = {
20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71, 72, 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82,
83, 84, 85, 5, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 6,
98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113,
114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, 160,
161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176,
177, 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
193, 14, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
};
};
} // namespace Decoder
} // namespace Tegra

View file

@ -0,0 +1,369 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <algorithm>
#include <list>
#include <vector>
#include "common/cityhash.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
namespace Decoder {
struct Vp9FrameDimensions {
s16 width{};
s16 height{};
s16 luma_pitch{};
s16 chroma_pitch{};
};
static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");
enum FrameFlags : u32 {
IsKeyFrame = 1 << 0,
LastFrameIsKeyFrame = 1 << 1,
FrameSizeChanged = 1 << 2,
ErrorResilientMode = 1 << 3,
LastShowFrame = 1 << 4,
IntraOnly = 1 << 5,
};
enum class MvJointType {
MvJointZero = 0, /* Zero vector */
MvJointHnzvz = 1, /* Vert zero, hor nonzero */
MvJointHzvnz = 2, /* Hor zero, vert nonzero */
MvJointHnzvnz = 3, /* Both components nonzero */
};
enum class MvClassType {
MvClass0 = 0, /* (0, 2] integer pel */
MvClass1 = 1, /* (2, 4] integer pel */
MvClass2 = 2, /* (4, 8] integer pel */
MvClass3 = 3, /* (8, 16] integer pel */
MvClass4 = 4, /* (16, 32] integer pel */
MvClass5 = 5, /* (32, 64] integer pel */
MvClass6 = 6, /* (64, 128] integer pel */
MvClass7 = 7, /* (128, 256] integer pel */
MvClass8 = 8, /* (256, 512] integer pel */
MvClass9 = 9, /* (512, 1024] integer pel */
MvClass10 = 10, /* (1024,2048] integer pel */
};
enum class BlockSize {
Block4x4 = 0,
Block4x8 = 1,
Block8x4 = 2,
Block8x8 = 3,
Block8x16 = 4,
Block16x8 = 5,
Block16x16 = 6,
Block16x32 = 7,
Block32x16 = 8,
Block32x32 = 9,
Block32x64 = 10,
Block64x32 = 11,
Block64x64 = 12,
BlockSizes = 13,
BlockInvalid = BlockSizes
};
enum class PredictionMode {
DcPred = 0, // Average of above and left pixels
VPred = 1, // Vertical
HPred = 2, // Horizontal
D45Pred = 3, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi)
D135Pred = 4, // Directional 135 deg = 180 - 45
D117Pred = 5, // Directional 117 deg = 180 - 63
D153Pred = 6, // Directional 153 deg = 180 - 27
D207Pred = 7, // Directional 207 deg = 180 + 27
D63Pred = 8, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi)
TmPred = 9, // True-motion
NearestMv = 10,
NearMv = 11,
ZeroMv = 12,
NewMv = 13,
MbModeCount = 14
};
enum class TxSize {
Tx4x4 = 0, // 4x4 transform
Tx8x8 = 1, // 8x8 transform
Tx16x16 = 2, // 16x16 transform
Tx32x32 = 3, // 32x32 transform
TxSizes = 4
};
enum class TxMode {
Only4X4 = 0, // Only 4x4 transform used
Allow8X8 = 1, // Allow block transform size up to 8x8
Allow16X16 = 2, // Allow block transform size up to 16x16
Allow32X32 = 3, // Allow block transform size up to 32x32
TxModeSelect = 4, // Transform specified for each block
TxModes = 5
};
enum class reference_mode {
SingleReference = 0,
CompoundReference = 1,
ReferenceModeSelect = 2,
ReferenceModes = 3
};
struct Segmentation {
u8 enabled{};
u8 update_map{};
u8 temporal_update{};
u8 abs_delta{};
std::array<u32, 8> feature_mask{};
std::array<std::array<s16, 4>, 8> feature_data{};
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
struct LoopFilter {
u8 mode_ref_delta_enabled{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
};
static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
struct Vp9EntropyProbs {
std::array<u8, 36> y_mode_prob{};
std::array<u8, 64> partition_prob{};
std::array<u8, 2304> coef_probs{};
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 3> skip_probs{};
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 20> classes{};
std::array<u8, 2> class_0{};
std::array<u8, 20> prob_bits{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
};
static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size");
struct Vp9PictureInfo {
bool is_key_frame{};
bool intra_only{};
bool last_frame_was_key{};
bool frame_size_changed{};
bool error_resilient_mode{};
bool last_frame_shown{};
bool show_frame{};
std::array<s8, 4> ref_frame_sign_bias{};
s32 base_q_index{};
s32 y_dc_delta_q{};
s32 uv_dc_delta_q{};
s32 uv_ac_delta_q{};
bool lossless{};
s32 transform_mode{};
bool allow_high_precision_mv{};
s32 interp_filter{};
s32 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
s32 log2_tile_cols{};
s32 log2_tile_rows{};
bool segment_enabled{};
bool segment_map_update{};
bool segment_map_temporal_update{};
s32 segment_abs_delta{};
std::array<u32, 8> segment_feature_enable{};
std::array<std::array<s16, 4>, 8> segment_feature_data{};
bool mode_ref_delta_enabled{};
bool use_prev_in_find_mv_refs{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
Vp9EntropyProbs entropy{};
Vp9FrameDimensions frame_size{};
u8 first_level{};
u8 sharpness_level{};
u32 bitstream_size{};
std::array<u64, 4> frame_offsets{};
std::array<bool, 4> refresh_frame{};
};
struct Vp9FrameContainer {
Vp9PictureInfo info{};
std::vector<u8> bit_stream;
};
struct PictureInfo {
INSERT_PADDING_WORDS(12);
u32 bitstream_size{};
INSERT_PADDING_WORDS(5);
Vp9FrameDimensions last_frame_size{};
Vp9FrameDimensions golden_frame_size{};
Vp9FrameDimensions alt_frame_size{};
Vp9FrameDimensions current_frame_size{};
u32 vp9_flags{};
std::array<s8, 4> ref_frame_sign_bias{};
u8 first_level{};
u8 sharpness_level{};
u8 base_q_index{};
u8 y_dc_delta_q{};
u8 uv_ac_delta_q{};
u8 uv_dc_delta_q{};
u8 lossless{};
u8 tx_mode{};
u8 allow_high_precision_mv{};
u8 interp_filter{};
u8 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
u8 log2_tile_cols{};
u8 log2_tile_rows{};
Segmentation segmentation{};
LoopFilter loop_filter{};
INSERT_PADDING_BYTES(5);
u32 surface_params{};
INSERT_PADDING_WORDS(3);
Vp9PictureInfo Convert() const {
return Vp9PictureInfo{
.is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
.intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
.last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
.frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
.error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
.last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
.ref_frame_sign_bias = ref_frame_sign_bias,
.base_q_index = base_q_index,
.y_dc_delta_q = y_dc_delta_q,
.uv_dc_delta_q = uv_dc_delta_q,
.uv_ac_delta_q = uv_ac_delta_q,
.lossless = lossless != 0,
.transform_mode = tx_mode,
.allow_high_precision_mv = allow_high_precision_mv != 0,
.interp_filter = interp_filter,
.reference_mode = reference_mode,
.comp_fixed_ref = comp_fixed_ref,
.comp_var_ref = comp_var_ref,
.log2_tile_cols = log2_tile_cols,
.log2_tile_rows = log2_tile_rows,
.segment_enabled = segmentation.enabled != 0,
.segment_map_update = segmentation.update_map != 0,
.segment_map_temporal_update = segmentation.temporal_update != 0,
.segment_abs_delta = segmentation.abs_delta,
.segment_feature_enable = segmentation.feature_mask,
.segment_feature_data = segmentation.feature_data,
.mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
.use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) &&
!(vp9_flags == (FrameFlags::FrameSizeChanged)) &&
!(vp9_flags == (FrameFlags::IntraOnly)) &&
(vp9_flags == (FrameFlags::LastShowFrame)) &&
!(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)),
.ref_deltas = loop_filter.ref_deltas,
.mode_deltas = loop_filter.mode_deltas,
.frame_size = current_frame_size,
.first_level = first_level,
.sharpness_level = sharpness_level,
.bitstream_size = bitstream_size,
};
}
};
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
INSERT_PADDING_BYTES(1024);
std::array<std::array<u8, 4>, 7> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
INSERT_PADDING_BYTES(80);
std::array<std::array<u8, 1>, 2> tx_8x8_prob{};
std::array<std::array<u8, 2>, 2> tx_16x16_prob{};
std::array<std::array<u8, 3>, 2> tx_32x32_prob{};
std::array<u8, 4> y_mode_prob_e8{};
std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
INSERT_PADDING_BYTES(64);
std::array<std::array<u8, 4>, 16> partition_prob{};
INSERT_PADDING_BYTES(10);
std::array<std::array<u8, 2>, 4> switchable_interp_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 4> skip_probs{};
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<std::array<u8, 1>, 2> class_0{};
std::array<std::array<u8, 3>, 2> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
std::array<std::array<u8, 10>, 2> classes{};
std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{};
std::array<std::array<u8, 10>, 2> pred_bits{};
std::array<std::array<u8, 2>, 5> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
INSERT_PADDING_BYTES(17);
std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4>
coef_probs{};
void Convert(Vp9EntropyProbs& fc) {
std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size());
std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(),
fc.intra_inter_prob.size());
std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size());
std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size());
std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size());
for (s32 i = 0; i < 4; i++) {
for (s32 j = 0; j < 9; j++) {
fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
}
}
std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size());
std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(),
fc.switchable_interp_prob.size());
std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size());
std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size());
std::memcpy(fc.joints.data(), joints.data(), fc.joints.size());
std::memcpy(fc.sign.data(), sign.data(), fc.sign.size());
std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size());
std::memcpy(fc.fr.data(), fr.data(), fc.fr.size());
std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size());
std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size());
std::memcpy(fc.classes.data(), classes.data(), fc.classes.size());
std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size());
std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size());
std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size());
std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size());
std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size());
}
};
static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
enum class Ref { Last, Golden, AltRef };
struct RefPoolElement {
s64 frame{};
Ref ref{};
bool refresh{};
};
struct FrameContexts {
s64 from{};
bool adapted{};
Vp9EntropyProbs probs{};
};
}; // namespace Decoder
}; // namespace Tegra

View file

@ -0,0 +1,39 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/host1x.h"
#include "video_core/gpu.h"
Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
Tegra::Host1x::~Host1x() = default;
void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Tegra::Host1x::ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments) {
StateWrite(static_cast<u32>(method), arguments[0]);
switch (method) {
case Method::WaitSyncpt:
Execute(arguments[0]);
break;
case Method::LoadSyncptPayload32:
syncpoint_value = arguments[0];
break;
case Method::WaitSyncpt32:
Execute(arguments[0]);
break;
default:
UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
break;
}
}
void Tegra::Host1x::Execute(u32 data) {
// This method waits on a valid syncpoint.
// TODO: Implement when proper Async is in place
}

View file

@ -0,0 +1,78 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
class Nvdec;
class Host1x {
public:
struct Host1xClassRegisters {
u32 incr_syncpt{};
u32 incr_syncpt_ctrl{};
u32 incr_syncpt_error{};
INSERT_PADDING_WORDS(5);
u32 wait_syncpt{};
u32 wait_syncpt_base{};
u32 wait_syncpt_incr{};
u32 load_syncpt_base{};
u32 incr_syncpt_base{};
u32 clear{};
u32 wait{};
u32 wait_with_interrupt{};
u32 delay_use{};
u32 tick_count_high{};
u32 tick_count_low{};
u32 tick_ctrl{};
INSERT_PADDING_WORDS(23);
u32 ind_ctrl{};
u32 ind_off2{};
u32 ind_off{};
std::array<u32, 31> ind_data{};
INSERT_PADDING_WORDS(1);
u32 load_syncpoint_payload32{};
u32 stall_ctrl{};
u32 wait_syncpt32{};
u32 wait_syncpt_base32{};
u32 load_syncpt_base32{};
u32 incr_syncpt_base32{};
u32 stall_count_high{};
u32 stall_count_low{};
u32 xref_ctrl{};
u32 channel_xref_high{};
u32 channel_xref_low{};
};
static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
enum class Method : u32 {
WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
};
explicit Host1x(GPU& gpu);
~Host1x();
/// Writes the method into the state, Invoke Execute() if encountered
void ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments);
private:
/// For Host1x, execute is waiting on a syncpoint previously written into the state
void Execute(u32 data);
/// Write argument into the provided offset
void StateWrite(u32 offset, u32 arguments);
u32 syncpoint_value{};
Host1xClassRegisters state{};
GPU& gpu;
};
} // namespace Tegra

View file

@ -0,0 +1,56 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <bitset>
#include "common/assert.h"
#include "common/bit_util.h"
#include "core/memory.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
Nvdec::~Nvdec() = default;
void Nvdec::ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments) {
if (method == Method::SetVideoCodec) {
codec->StateWrite(static_cast<u32>(method), arguments[0]);
} else {
codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
}
switch (method) {
case Method::SetVideoCodec:
codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
break;
case Method::Execute:
Execute();
break;
}
}
AVFrame* Nvdec::GetFrame() {
return codec->GetCurrentFrame();
}
const AVFrame* Nvdec::GetFrame() const {
return codec->GetCurrentFrame();
}
void Nvdec::Execute() {
switch (codec->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::Vp9:
codec->Decode();
break;
default:
UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
break;
}
}
} // namespace Tegra

View file

@ -0,0 +1,39 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/codecs/codec.h"
namespace Tegra {
class GPU;
class Nvdec {
public:
enum class Method : u32 {
SetVideoCodec = 0x80,
Execute = 0xc0,
};
explicit Nvdec(GPU& gpu);
~Nvdec();
/// Writes the method into the state, Invoke Execute() if encountered
void ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments);
/// Return most recently decoded frame
AVFrame* GetFrame();
const AVFrame* GetFrame() const;
private:
/// Invoke codec to decode a frame
void Execute();
GPU& gpu;
std::unique_ptr<Tegra::Codec> codec;
};
} // namespace Tegra

View file

@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra::NvdecCommon {
struct NvdecRegisters {
INSERT_PADDING_WORDS(256);
u64 set_codec_id{};
INSERT_PADDING_WORDS(254);
u64 set_platform_id{};
u64 picture_info_offset{};
u64 frame_bitstream_offset{};
u64 frame_number{};
u64 h264_slice_data_offsets{};
u64 h264_mv_dump_offset{};
INSERT_PADDING_WORDS(6);
u64 frame_stats_offset{};
u64 h264_last_surface_luma_offset{};
u64 h264_last_surface_chroma_offset{};
std::array<u64, 17> surface_luma_offset{};
std::array<u64, 17> surface_chroma_offset{};
INSERT_PADDING_WORDS(132);
u64 vp9_entropy_probs_offset{};
u64 vp9_backward_updates_offset{};
u64 vp9_last_frame_segmap_offset{};
u64 vp9_curr_frame_segmap_offset{};
INSERT_PADDING_WORDS(2);
u64 vp9_last_frame_mvs_offset{};
u64 vp9_curr_frame_mvs_offset{};
INSERT_PADDING_WORDS(2);
};
static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
enum class VideoCodec : u32 {
None = 0x0,
H264 = 0x3,
Vp8 = 0x5,
H265 = 0x7,
Vp9 = 0x9,
};
} // namespace Tegra::NvdecCommon

View file

@ -0,0 +1,60 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <algorithm>
#include "sync_manager.h"
#include "video_core/gpu.h"
namespace Tegra {
SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
SyncptIncrManager::~SyncptIncrManager() = default;
void SyncptIncrManager::Increment(u32 id) {
increments.push_back(SyncptIncr{0, id, true});
IncrementAllDone();
}
u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
const u32 handle = current_id++;
increments.push_back(SyncptIncr{handle, class_id, id});
return handle;
}
void SyncptIncrManager::SignalDone(u32 handle) {
auto done_incr = std::find_if(increments.begin(), increments.end(),
[handle](SyncptIncr incr) { return incr.id == handle; });
if (done_incr != increments.end()) {
const SyncptIncr incr = *done_incr;
*done_incr = SyncptIncr{incr.id, incr.class_id, incr.syncpt_id, true};
}
IncrementAllDone();
}
void SyncptIncrManager::IncrementAllDone() {
std::size_t done_count = 0;
for (; done_count < increments.size(); ++done_count) {
if (!increments[done_count].complete) {
break;
}
gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
}
increments.erase(increments.begin(), increments.begin() + done_count);
}
} // namespace Tegra

View file

@ -0,0 +1,64 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
namespace Tegra {
class GPU;
struct SyncptIncr {
u32 id;
u32 class_id;
u32 syncpt_id;
bool complete;
SyncptIncr(u32 id, u32 syncpt_id_, u32 class_id_, bool done = false)
: id(id), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
};
class SyncptIncrManager {
public:
explicit SyncptIncrManager(GPU& gpu);
~SyncptIncrManager();
/// Add syncpoint id and increment all
void Increment(u32 id);
/// Returns a handle to increment later
u32 IncrementWhenDone(u32 class_id, u32 id);
/// IncrememntAllDone, including handle
void SignalDone(u32 handle);
/// Increment all sequential pending increments that are already done.
void IncrementAllDone();
private:
std::vector<SyncptIncr> increments;
std::mutex increment_lock;
u32 current_id{};
GPU& gpu;
};
} // namespace Tegra

View file

@ -0,0 +1,180 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/command_classes/vic.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/texture_cache/surface_params.h"
extern "C" {
#include <libswscale/swscale.h>
}
namespace Tegra {
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
Vic::~Vic() = default;
void Vic::VicStateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Vic::ProcessMethod(Vic::Method method, const std::vector<u32>& arguments) {
LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
VicStateWrite(static_cast<u32>(method), arguments[0]);
const u64 arg = static_cast<u64>(arguments[0]) << 8;
switch (method) {
case Method::Execute:
Execute();
break;
case Method::SetConfigStructOffset:
config_struct_address = arg;
break;
case Method::SetOutputSurfaceLumaOffset:
output_surface_luma_address = arg;
break;
case Method::SetOutputSurfaceChromaUOffset:
output_surface_chroma_u_address = arg;
break;
case Method::SetOutputSurfaceChromaVOffset:
output_surface_chroma_v_address = arg;
break;
default:
break;
}
}
void Vic::Execute() {
if (output_surface_luma_address == 0) {
LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
vic_state.output_surface.luma_offset);
return;
}
const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
const VideoPixelFormat pixel_format =
static_cast<VideoPixelFormat>(config.pixel_format.Value());
switch (pixel_format) {
case VideoPixelFormat::BGRA8:
case VideoPixelFormat::RGBA8: {
LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
const auto* frame = nvdec_processor->GetFrame();
if (!frame || frame->width == 0 || frame->height == 0) {
return;
}
if (scaler_ctx == nullptr || frame->width != scaler_width ||
frame->height != scaler_height) {
const AVPixelFormat target_format =
(pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
sws_freeContext(scaler_ctx);
scaler_ctx = nullptr;
// FFmpeg returns all frames in YUV420, convert it into expected format
scaler_ctx =
sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
frame->height, target_format, 0, nullptr, nullptr, nullptr);
scaler_width = frame->width;
scaler_height = frame->height;
}
// Get Converted frame
const std::size_t linear_size = frame->width * frame->height * 4;
using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
const int converted_stride{frame->width * 4};
u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
&converted_frame_buf_addr, &converted_stride);
const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
if (blk_kind != 0) {
// swizzle pitch linear to block linear
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
block_height, 0);
std::vector<u8> swizzled_data(size);
Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4,
swizzled_data.data(), converted_frame_buffer.get(),
false, block_height, 0, 1);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
gpu.Maxwell3D().OnMemoryWrite();
} else {
// send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size);
gpu.Maxwell3D().OnMemoryWrite();
}
break;
}
case VideoPixelFormat::Yuv420: {
LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
const auto* frame = nvdec_processor->GetFrame();
if (!frame || frame->width == 0 || frame->height == 0) {
return;
}
const std::size_t surface_width = config.surface_width_minus1 + 1;
const std::size_t surface_height = config.surface_height_minus1 + 1;
const std::size_t half_width = surface_width / 2;
const std::size_t half_height = config.surface_height_minus1 / 2;
const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
const auto* luma_ptr = frame->data[0];
const auto* chroma_b_ptr = frame->data[1];
const auto* chroma_r_ptr = frame->data[2];
const auto stride = frame->linesize[0];
const auto half_stride = frame->linesize[1];
std::vector<u8> luma_buffer(aligned_width * surface_height);
std::vector<u8> chroma_buffer(aligned_width * half_height);
// Populate luma buffer
for (std::size_t y = 0; y < surface_height - 1; ++y) {
std::size_t src = y * stride;
std::size_t dst = y * aligned_width;
std::size_t size = surface_width;
for (std::size_t offset = 0; offset < size; ++offset) {
luma_buffer[dst + offset] = luma_ptr[src + offset];
}
}
gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
luma_buffer.size());
// Populate chroma buffer from both channels with interleaving.
for (std::size_t y = 0; y < half_height; ++y) {
std::size_t src = y * half_stride;
std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < half_width; ++x) {
chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
}
}
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size());
gpu.Maxwell3D().OnMemoryWrite();
break;
}
default:
UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
break;
}
}
} // namespace Tegra

View file

@ -0,0 +1,110 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
struct PlaneOffsets {
u32 luma_offset{};
u32 chroma_u_offset{};
u32 chroma_v_offset{};
};
struct VicRegisters {
INSERT_PADDING_WORDS(64);
u32 nop{};
INSERT_PADDING_WORDS(15);
u32 pm_trigger{};
INSERT_PADDING_WORDS(47);
u32 set_application_id{};
u32 set_watchdog_timer{};
INSERT_PADDING_WORDS(17);
u32 context_save_area{};
u32 context_switch{};
INSERT_PADDING_WORDS(43);
u32 execute{};
INSERT_PADDING_WORDS(63);
std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
u32 picture_index{};
u32 control_params{};
u32 config_struct_offset{};
u32 filter_struct_offset{};
u32 palette_offset{};
u32 hist_offset{};
u32 context_id{};
u32 fce_ucode_size{};
PlaneOffsets output_surface{};
u32 fce_ucode_offset{};
INSERT_PADDING_WORDS(4);
std::array<u32, 8> slot_context_id{};
INSERT_PADDING_WORDS(16);
};
static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
class Vic {
public:
enum class Method : u32 {
Execute = 0xc0,
SetControlParams = 0x1c1,
SetConfigStructOffset = 0x1c2,
SetOutputSurfaceLumaOffset = 0x1c8,
SetOutputSurfaceChromaUOffset = 0x1c9,
SetOutputSurfaceChromaVOffset = 0x1ca
};
explicit Vic(GPU& gpu, std::shared_ptr<Tegra::Nvdec> nvdec_processor);
~Vic();
/// Write to the device state.
void ProcessMethod(Vic::Method method, const std::vector<u32>& arguments);
private:
void Execute();
void VicStateWrite(u32 offset, u32 arguments);
VicRegisters vic_state{};
enum class VideoPixelFormat : u64_le {
RGBA8 = 0x1f,
BGRA8 = 0x20,
Yuv420 = 0x44,
};
union VicConfig {
u64_le raw{};
BitField<0, 7, u64_le> pixel_format;
BitField<7, 2, u64_le> chroma_loc_horiz;
BitField<9, 2, u64_le> chroma_loc_vert;
BitField<11, 4, u64_le> block_linear_kind;
BitField<15, 4, u64_le> block_linear_height_log2;
BitField<19, 3, u64_le> reserved0;
BitField<22, 10, u64_le> reserved1;
BitField<32, 14, u64_le> surface_width_minus1;
BitField<46, 14, u64_le> surface_height_minus1;
};
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
GPUVAddr config_struct_address{};
GPUVAddr output_surface_luma_address{};
GPUVAddr output_surface_chroma_u_address{};
GPUVAddr output_surface_chroma_v_address{};
SwsContext* scaler_ctx{};
s32 scaler_width{};
s32 scaler_height{};
};
} // namespace Tegra

View file

@ -27,9 +27,10 @@ namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
GPU::GPU(Core::System& system_, bool is_async_) GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
: system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
fermi_2d{std::make_unique<Engines::Fermi2D>()}, fermi_2d{std::make_unique<Engines::Fermi2D>()},
kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() {
return *dma_pusher; return *dma_pusher;
} }
Tegra::CDmaPusher& GPU::CDmaPusher() {
return *cdma_pusher;
}
const DmaPusher& GPU::DmaPusher() const { const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher; return *dma_pusher;
} }
const Tegra::CDmaPusher& GPU::CDmaPusher() const {
return *cdma_pusher;
}
void GPU::WaitFence(u32 syncpoint_id, u32 value) { void GPU::WaitFence(u32 syncpoint_id, u32 value) {
// Synced GPU, is always in sync // Synced GPU, is always in sync
if (!is_async) { if (!is_async) {

View file

@ -13,6 +13,7 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h" #include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
#include "video_core/dma_pusher.h" #include "video_core/dma_pusher.h"
using CacheAddr = std::uintptr_t; using CacheAddr = std::uintptr_t;
@ -157,7 +158,7 @@ public:
method_count(method_count) {} method_count(method_count) {}
}; };
explicit GPU(Core::System& system, bool is_async); explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
virtual ~GPU(); virtual ~GPU();
/// Binds a renderer to the GPU. /// Binds a renderer to the GPU.
@ -209,6 +210,15 @@ public:
/// Returns a reference to the GPU DMA pusher. /// Returns a reference to the GPU DMA pusher.
Tegra::DmaPusher& DmaPusher(); Tegra::DmaPusher& DmaPusher();
/// Returns a const reference to the GPU DMA pusher.
const Tegra::DmaPusher& DmaPusher() const;
/// Returns a reference to the GPU CDMA pusher.
Tegra::CDmaPusher& CDmaPusher();
/// Returns a const reference to the GPU CDMA pusher.
const Tegra::CDmaPusher& CDmaPusher() const;
VideoCore::RendererBase& Renderer() { VideoCore::RendererBase& Renderer() {
return *renderer; return *renderer;
} }
@ -249,8 +259,9 @@ public:
return is_async; return is_async;
} }
/// Returns a const reference to the GPU DMA pusher. bool UseNvdec() const {
const Tegra::DmaPusher& DmaPusher() const; return use_nvdec;
}
struct Regs { struct Regs {
static constexpr size_t NUM_REGS = 0x40; static constexpr size_t NUM_REGS = 0x40;
@ -311,6 +322,9 @@ public:
/// Push GPU command entries to be processed /// Push GPU command entries to be processed
virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
/// Push GPU command buffer entries to be processed
virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;
/// Swap buffers (render frame) /// Swap buffers (render frame)
virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
@ -349,7 +363,9 @@ protected:
Core::System& system; Core::System& system;
std::unique_ptr<Tegra::MemoryManager> memory_manager; std::unique_ptr<Tegra::MemoryManager> memory_manager;
std::unique_ptr<Tegra::DmaPusher> dma_pusher; std::unique_ptr<Tegra::DmaPusher> dma_pusher;
std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
std::unique_ptr<VideoCore::RendererBase> renderer; std::unique_ptr<VideoCore::RendererBase> renderer;
const bool use_nvdec;
private: private:
/// Mapping of command subchannels to their bound engine ids /// Mapping of command subchannels to their bound engine ids
@ -372,6 +388,7 @@ private:
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex; std::mutex sync_mutex;
std::mutex device_mutex;
std::condition_variable sync_cv; std::condition_variable sync_cv;

View file

@ -10,12 +10,13 @@
namespace VideoCommon { namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {} GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec)
: GPU{system, true, use_nvdec}, gpu_thread{system} {}
GPUAsynch::~GPUAsynch() = default; GPUAsynch::~GPUAsynch() = default;
void GPUAsynch::Start() { void GPUAsynch::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext(); cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent(); cpu_context->MakeCurrent();
} }
@ -32,6 +33,27 @@ void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries)); gpu_thread.SubmitList(std::move(entries));
} }
void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
// This condition fires when a video stream ends, clear all intermediary data
if (entries[0].raw == 0xDEADB33F) {
cdma_pusher.reset();
return;
}
if (!cdma_pusher) {
cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
}
// SubmitCommandBuffer would make the nvdec operations async, this is not currently working
// TODO(ameerj): RE proper async nvdec operation
// gpu_thread.SubmitCommandBuffer(std::move(entries));
cdma_pusher->Push(std::move(entries));
cdma_pusher->DispatchCalls();
}
void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer); gpu_thread.SwapBuffers(framebuffer);
} }

View file

@ -20,13 +20,14 @@ namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU asynchronously /// Implementation of GPU interface that runs the GPU asynchronously
class GPUAsynch final : public Tegra::GPU { class GPUAsynch final : public Tegra::GPU {
public: public:
explicit GPUAsynch(Core::System& system); explicit GPUAsynch(Core::System& system, bool use_nvdec);
~GPUAsynch() override; ~GPUAsynch() override;
void Start() override; void Start() override;
void ObtainContext() override; void ObtainContext() override;
void ReleaseContext() override; void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override; void PushGPUEntries(Tegra::CommandList&& entries) override;
void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override;

View file

@ -7,7 +7,7 @@
namespace VideoCommon { namespace VideoCommon {
GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {} GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {}
GPUSynch::~GPUSynch() = default; GPUSynch::~GPUSynch() = default;
@ -26,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
dma_pusher->DispatchCalls(); dma_pusher->DispatchCalls();
} }
void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
// This condition fires when a video stream ends, clears all intermediary data
if (entries[0].raw == 0xDEADB33F) {
cdma_pusher.reset();
return;
}
if (!cdma_pusher) {
cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
}
cdma_pusher->Push(std::move(entries));
cdma_pusher->DispatchCalls();
}
void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
renderer->SwapBuffers(framebuffer); renderer->SwapBuffers(framebuffer);
} }

View file

@ -19,13 +19,14 @@ namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU synchronously /// Implementation of GPU interface that runs the GPU synchronously
class GPUSynch final : public Tegra::GPU { class GPUSynch final : public Tegra::GPU {
public: public:
explicit GPUSynch(Core::System& system); explicit GPUSynch(Core::System& system, bool use_nvdec);
~GPUSynch() override; ~GPUSynch() override;
void Start() override; void Start() override;
void ObtainContext() override; void ObtainContext() override;
void ReleaseContext() override; void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override; void PushGPUEntries(Tegra::CommandList&& entries) override;
void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override;

View file

@ -18,7 +18,7 @@ namespace VideoCommon::GPUThread {
/// Runs the GPU thread /// Runs the GPU thread
static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
SynchState& state) { SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
std::string name = "yuzu:GPU"; std::string name = "yuzu:GPU";
MicroProfileOnThreadCreate(name.c_str()); MicroProfileOnThreadCreate(name.c_str());
Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadName(name.c_str());
@ -42,6 +42,10 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls(); dma_pusher.DispatchCalls();
} else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
// NVDEC
cdma_pusher.Push(std::move(command_list->entries));
cdma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
@ -75,15 +79,19 @@ ThreadManager::~ThreadManager() {
void ThreadManager::StartThread(VideoCore::RendererBase& renderer, void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher) { Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
thread = std::thread{RunThread, std::ref(system), std::ref(renderer), thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
std::ref(context), std::ref(dma_pusher), std::ref(state)}; std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
} }
void ThreadManager::SubmitList(Tegra::CommandList&& entries) { void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
PushCommand(SubmitListCommand(std::move(entries))); PushCommand(SubmitListCommand(std::move(entries)));
} }
void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
PushCommand(SubmitChCommandEntries(std::move(entries)));
}
void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
} }

View file

@ -37,6 +37,14 @@ struct SubmitListCommand final {
Tegra::CommandList entries; Tegra::CommandList entries;
}; };
/// Command to signal to the GPU thread that a cdma command list is ready for processing
struct SubmitChCommandEntries final {
explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries)
: entries{std::move(entries)} {}
Tegra::ChCommandHeaderList entries;
};
/// Command to signal to the GPU thread that a swap buffers is pending /// Command to signal to the GPU thread that a swap buffers is pending
struct SwapBuffersCommand final { struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
@ -77,9 +85,9 @@ struct OnCommandListEndCommand final {};
struct GPUTickCommand final {}; struct GPUTickCommand final {};
using CommandData = using CommandData =
std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
GPUTickCommand>; FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
struct CommandDataContainer { struct CommandDataContainer {
CommandDataContainer() = default; CommandDataContainer() = default;
@ -109,11 +117,14 @@ public:
/// Creates and starts the GPU thread. /// Creates and starts the GPU thread.
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher); Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
/// Push GPU command entries to be processed /// Push GPU command entries to be processed
void SubmitList(Tegra::CommandList&& entries); void SubmitList(Tegra::CommandList&& entries);
/// Push GPU CDMA command buffer entries to be processed
void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
/// Swap buffers (render frame) /// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);

View file

@ -11,6 +11,7 @@
#include "video_core/gpu.h" #include "video_core/gpu.h"
#include "video_core/memory_manager.h" #include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
namespace Tegra { namespace Tegra {
@ -44,6 +45,12 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_
return Map(cpu_addr, *FindFreeRange(size, align), size); return Map(cpu_addr, *FindFreeRange(size, align), size);
} }
GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true);
ASSERT(gpu_addr);
return Map(cpu_addr, *gpu_addr, size);
}
void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
if (!size) { if (!size) {
return; return;
@ -108,7 +115,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s
page_table[PageEntryIndex(gpu_addr)] = page_entry; page_table[PageEntryIndex(gpu_addr)] = page_entry;
} }
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const { std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address) const {
if (!align) { if (!align) {
align = page_size; align = page_size;
} else { } else {
@ -116,7 +124,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size
} }
u64 available_size{}; u64 available_size{};
GPUVAddr gpu_addr{address_space_start}; GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
while (gpu_addr + available_size < address_space_size) { while (gpu_addr + available_size < address_space_size) {
if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
available_size += page_size; available_size += page_size;

View file

@ -116,6 +116,7 @@ public:
[[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
[[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
void Unmap(GPUVAddr gpu_addr, std::size_t size); void Unmap(GPUVAddr gpu_addr, std::size_t size);
@ -124,7 +125,8 @@ private:
[[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const; [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address = false) const;
void TryLockPage(PageEntry page_entry, std::size_t size); void TryLockPage(PageEntry page_entry, std::size_t size);
void TryUnlockPage(PageEntry page_entry, std::size_t size); void TryUnlockPage(PageEntry page_entry, std::size_t size);
@ -135,6 +137,7 @@ private:
static constexpr u64 address_space_size = 1ULL << 40; static constexpr u64 address_space_size = 1ULL << 40;
static constexpr u64 address_space_start = 1ULL << 32; static constexpr u64 address_space_start = 1ULL << 32;
static constexpr u64 address_space_start_low = 1ULL << 16;
static constexpr u64 page_bits{16}; static constexpr u64 page_bits{16};
static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_size{1 << page_bits};
static constexpr u64 page_mask{page_size - 1}; static constexpr u64 page_mask{page_size - 1};

View file

@ -44,10 +44,11 @@ namespace VideoCore {
std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
std::unique_ptr<Tegra::GPU> gpu; std::unique_ptr<Tegra::GPU> gpu;
const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
gpu = std::make_unique<VideoCommon::GPUAsynch>(system); gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec);
} else { } else {
gpu = std::make_unique<VideoCommon::GPUSynch>(system); gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec);
} }
auto context = emu_window.CreateSharedContext(); auto context = emu_window.CreateSharedContext();

View file

@ -265,9 +265,11 @@ if (MSVC)
include(CopyYuzuQt5Deps) include(CopyYuzuQt5Deps)
include(CopyYuzuSDLDeps) include(CopyYuzuSDLDeps)
include(CopyYuzuUnicornDeps) include(CopyYuzuUnicornDeps)
include(CopyYuzuFFmpegDeps)
copy_yuzu_Qt5_deps(yuzu) copy_yuzu_Qt5_deps(yuzu)
copy_yuzu_SDL_deps(yuzu) copy_yuzu_SDL_deps(yuzu)
copy_yuzu_unicorn_deps(yuzu) copy_yuzu_unicorn_deps(yuzu)
copy_yuzu_FFmpeg_deps(yuzu)
endif() endif()
if (NOT APPLE) if (NOT APPLE)

View file

@ -717,6 +717,8 @@ void Config::ReadRendererValues() {
ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0); ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0);
ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation, ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,
QStringLiteral("use_asynchronous_gpu_emulation"), false); QStringLiteral("use_asynchronous_gpu_emulation"), false);
ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"),
true);
ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true); ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"), ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
false); false);
@ -1265,6 +1267,8 @@ void Config::SaveRendererValues() {
Settings::values.gpu_accuracy.UsingGlobal(), 0); Settings::values.gpu_accuracy.UsingGlobal(), 0);
WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"), WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),
Settings::values.use_asynchronous_gpu_emulation, false); Settings::values.use_asynchronous_gpu_emulation, false);
WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation,
true);
WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
WriteSettingGlobal(QStringLiteral("use_assembly_shaders"), WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
Settings::values.use_assembly_shaders, false); Settings::values.use_assembly_shaders, false);

View file

@ -70,9 +70,11 @@ void ConfigureGraphics::SetConfiguration() {
ui->api->setEnabled(runtime_lock); ui->api->setEnabled(runtime_lock);
ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock); ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
ui->use_disk_shader_cache->setEnabled(runtime_lock); ui->use_disk_shader_cache->setEnabled(runtime_lock);
ui->use_nvdec_emulation->setEnabled(runtime_lock);
ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue()); ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
ui->use_asynchronous_gpu_emulation->setChecked( ui->use_asynchronous_gpu_emulation->setChecked(
Settings::values.use_asynchronous_gpu_emulation.GetValue()); Settings::values.use_asynchronous_gpu_emulation.GetValue());
ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue());
if (Settings::configuring_global) { if (Settings::configuring_global) {
ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue())); ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue()));
@ -116,6 +118,9 @@ void ConfigureGraphics::ApplyConfiguration() {
Settings::values.use_asynchronous_gpu_emulation.SetValue( Settings::values.use_asynchronous_gpu_emulation.SetValue(
ui->use_asynchronous_gpu_emulation->isChecked()); ui->use_asynchronous_gpu_emulation->isChecked());
} }
if (Settings::values.use_nvdec_emulation.UsingGlobal()) {
Settings::values.use_nvdec_emulation.SetValue(ui->use_nvdec_emulation->isChecked());
}
if (Settings::values.bg_red.UsingGlobal()) { if (Settings::values.bg_red.UsingGlobal()) {
Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF())); Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));
Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF())); Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF()));
@ -144,6 +149,8 @@ void ConfigureGraphics::ApplyConfiguration() {
ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation, ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,
ui->use_asynchronous_gpu_emulation, ui->use_asynchronous_gpu_emulation,
use_asynchronous_gpu_emulation); use_asynchronous_gpu_emulation);
ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation,
ui->use_nvdec_emulation, use_nvdec_emulation);
if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) { if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
Settings::values.bg_red.SetGlobal(true); Settings::values.bg_red.SetGlobal(true);
@ -240,6 +247,7 @@ void ConfigureGraphics::SetupPerGameUI() {
ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal()); ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());
ui->use_asynchronous_gpu_emulation->setEnabled( ui->use_asynchronous_gpu_emulation->setEnabled(
Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal());
ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal()); ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());
ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal()); ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal());
@ -253,6 +261,8 @@ void ConfigureGraphics::SetupPerGameUI() {
ConfigurationShared::SetColoredTristate( ConfigurationShared::SetColoredTristate(
ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache); ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache);
ConfigurationShared::SetColoredTristate(
ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation);
ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation, ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation,
Settings::values.use_asynchronous_gpu_emulation, Settings::values.use_asynchronous_gpu_emulation,
use_asynchronous_gpu_emulation); use_asynchronous_gpu_emulation);

View file

@ -46,6 +46,7 @@ private:
std::unique_ptr<Ui::ConfigureGraphics> ui; std::unique_ptr<Ui::ConfigureGraphics> ui;
QColor bg_color; QColor bg_color;
ConfigurationShared::CheckState use_nvdec_emulation;
ConfigurationShared::CheckState use_disk_shader_cache; ConfigurationShared::CheckState use_disk_shader_cache;
ConfigurationShared::CheckState use_asynchronous_gpu_emulation; ConfigurationShared::CheckState use_asynchronous_gpu_emulation;

View file

@ -97,6 +97,13 @@
</property> </property>
</widget> </widget>
</item> </item>
<item>
<widget class="QCheckBox" name="use_nvdec_emulation">
<property name="text">
<string>Use NVDEC emulation</string>
</property>
</widget>
</item>
<item> <item>
<widget class="QWidget" name="aspect_ratio_layout" native="true"> <widget class="QWidget" name="aspect_ratio_layout" native="true">
<layout class="QHBoxLayout" name="horizontalLayout_6"> <layout class="QHBoxLayout" name="horizontalLayout_6">