forked from suyu/suyu

Compare commits


7 commits

SHA1  Message  Date

ee365bad95  Fixed missing reddit hyperlink  2024-10-06 11:40:15 +02:00
    All chat and Reddit mentions are hyperlinked, but one mention of the subreddit wasn't, so I hyperlinked it like the others.
27769c595b  Restored hyperlink to sudachi's website now that it is back up  2024-10-06 11:36:27 +02:00
26b1d7e879  enable boost concepts  2024-10-05 12:35:09 +02:00
40def7017c  include fmt/ranges.h  2024-10-05 09:10:15 +02:00
c52427b676  mark format functions as const  2024-10-05 08:04:46 +02:00
509b880eec  Revert all the trash commits that were breaking build, back to e5c47e911b  2024-10-05 13:50:31 +08:00
    This reverts commit 592f93b26c.
8d6b694569  Update README.md  2024-09-30 12:30:59 +02:00
29 changed files with 1469 additions and 768 deletions

View file

@ -279,8 +279,6 @@ endif()
# Configure C++ standard
# ===========================
# boost asio's concept usage doesn't play nicely with some compilers yet.
add_definitions(-DBOOST_ASIO_DISABLE_CONCEPTS)
if (MSVC)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/std:c++20>)

View file

@ -9,6 +9,8 @@ SPDX-License-Identifier: GPL-3.0-or-later
We're in need of developers. Please join our chat below or DM a dev if you want to contribute!
This repo is currently based on Yuzu EA 4176 but the code will be rewritten for legal and performance reasons.
Our only website is suyu.dev so please be cautious when using other sites offering builds/downloads.
<hr />
<h1 align="center">
@ -65,7 +67,7 @@ You can also contact any of the developers on the Chat to learn more about the c
* __Linux__: [Releases](https://git.suyu.dev/suyu/suyu/releases)
* __macOS__: [Releases](https://git.suyu.dev/suyu/suyu/releases)
* __Android__: [Releases](https://git.suyu.dev/suyu/suyu/releases)
###### We currently do not provide builds for iOS, however if you would like, you could try the experimental Sudachi Emulator and it's bigger project: [Folium](https://apps.apple.com/us/app/folium/id6498623389).
###### We currently do not provide builds for iOS, however if you would like, you could try the experimental [Sudachi Emulator](https://sudachi.emuplace.app/) and it's bigger project: [Folium](https://apps.apple.com/us/app/folium/id6498623389).
If you want daily builds then [Click here](https://git.suyu.dev/suyu/suyu/actions).
If you don't know how to download the daily builds then [Click here](https://git.suyu.dev/suyu/suyu/raw/branch/dev/img/daily-builds.png)
@ -85,7 +87,7 @@ For Multiplayer, we recommend using the "Yuzu Online" patch, install instruction
## Support
If you have any questions, don't hesitate to ask us in our [Chat](https://chat.suyu.dev) or Subreddit, make an issue or contact a developer. We don't bite!
If you have any questions, don't hesitate to ask us in our [Chat](https://chat.suyu.dev) or [Subreddit](https://www.reddit.com/r/suyu/), make an issue or contact a developer. We don't bite!
## License

View file

@ -14,7 +14,7 @@ template <typename T>
struct fmt::formatter<T, std::enable_if_t<std::is_enum_v<T>, char>>
: formatter<std::underlying_type_t<T>> {
template <typename FormatContext>
auto format(const T& value, FormatContext& ctx) -> decltype(ctx.out()) {
auto format(const T& value, FormatContext& ctx) const -> decltype(ctx.out()) {
return fmt::formatter<std::underlying_type_t<T>>::format(
static_cast<std::underlying_type_t<T>>(value), ctx);
}
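
The hunk above shows the pattern that the "mark format functions as const" commit applies across the tree: recent fmt releases invoke format() through a const formatter, so non-const overloads can fail to compile. A minimal, self-contained sketch of the same pattern; the Status enum is made up for illustration and is not part of this diff.

#include <fmt/format.h>

enum class Status { Ok, Error };

template <>
struct fmt::formatter<Status> : fmt::formatter<fmt::string_view> {
    template <typename FormatContext>
    // const-qualified, as newer fmt expects
    auto format(Status s, FormatContext& ctx) const -> decltype(ctx.out()) {
        return fmt::format_to(ctx.out(), "{}", s == Status::Ok ? "Ok" : "Error");
    }
};

// fmt::format("{}", Status::Error) then yields "Error".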

View file

@ -262,7 +262,7 @@ struct fmt::formatter<Common::PhysicalAddress> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Common::PhysicalAddress& addr, FormatContext& ctx) {
auto format(const Common::PhysicalAddress& addr, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{:#x}", static_cast<u64>(addr.GetValue()));
}
};
@ -273,7 +273,7 @@ struct fmt::formatter<Common::ProcessAddress> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Common::ProcessAddress& addr, FormatContext& ctx) {
auto format(const Common::ProcessAddress& addr, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{:#x}", static_cast<u64>(addr.GetValue()));
}
};
@ -284,7 +284,7 @@ struct fmt::formatter<Common::VirtualAddress> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Common::VirtualAddress& addr, FormatContext& ctx) {
auto format(const Common::VirtualAddress& addr, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{:#x}", static_cast<u64>(addr.GetValue()));
}
};

View file

@ -22,7 +22,7 @@ struct fmt::formatter<Dynarmic::A32::CoprocReg> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) {
auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "cp{}", static_cast<size_t>(reg));
}
};

View file

@ -26,6 +26,24 @@ std::shared_ptr<EventType> CreateEvent(std::string name, TimedCallback&& callbac
return std::make_shared<EventType>(std::move(callback), std::move(name));
}
struct CoreTiming::Event {
s64 time;
u64 fifo_order;
std::weak_ptr<EventType> type;
s64 reschedule_time;
heap_t::handle_type handle{};
// Sort by time, unless the times are the same, in which case sort by
// the order added to the queue
friend bool operator>(const Event& left, const Event& right) {
return std::tie(left.time, left.fifo_order) > std::tie(right.time, right.fifo_order);
}
friend bool operator<(const Event& left, const Event& right) {
return std::tie(left.time, left.fifo_order) < std::tie(right.time, right.fifo_order);
}
};
CoreTiming::CoreTiming() : clock{Common::CreateOptimalClock()} {}
CoreTiming::~CoreTiming() {
@ -69,7 +87,7 @@ void CoreTiming::Pause(bool is_paused) {
}
void CoreTiming::SyncPause(bool is_paused) {
if (is_paused == paused && paused_set == is_paused) {
if (is_paused == paused && paused_set == paused) {
return;
}
@ -94,7 +112,7 @@ bool CoreTiming::IsRunning() const {
bool CoreTiming::HasPendingEvents() const {
std::scoped_lock lock{basic_lock};
return !event_queue.empty();
return !(wait_set && event_queue.empty());
}
void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
@ -103,8 +121,8 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
std::scoped_lock scope{basic_lock};
const auto next_time{absolute_time ? ns_into_future : GetGlobalTimeNs() + ns_into_future};
event_queue.emplace_back(Event{next_time.count(), event_fifo_id++, event_type});
std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
auto h{event_queue.emplace(Event{next_time.count(), event_fifo_id++, event_type, 0})};
(*h).handle = h;
}
event.Set();
@ -118,9 +136,9 @@ void CoreTiming::ScheduleLoopingEvent(std::chrono::nanoseconds start_time,
std::scoped_lock scope{basic_lock};
const auto next_time{absolute_time ? start_time : GetGlobalTimeNs() + start_time};
event_queue.emplace_back(
Event{next_time.count(), event_fifo_id++, event_type, resched_time.count()});
std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
auto h{event_queue.emplace(
Event{next_time.count(), event_fifo_id++, event_type, resched_time.count()})};
(*h).handle = h;
}
event.Set();
@ -131,11 +149,17 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
{
std::scoped_lock lk{basic_lock};
event_queue.erase(
std::remove_if(event_queue.begin(), event_queue.end(),
[&](const Event& e) { return e.type.lock().get() == event_type.get(); }),
event_queue.end());
std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>());
std::vector<heap_t::handle_type> to_remove;
for (auto itr = event_queue.begin(); itr != event_queue.end(); itr++) {
const Event& e = *itr;
if (e.type.lock().get() == event_type.get()) {
to_remove.push_back(itr->handle);
}
}
for (auto& h : to_remove) {
event_queue.erase(h);
}
event_type->sequence_number++;
}
@ -148,7 +172,7 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
void CoreTiming::AddTicks(u64 ticks_to_add) {
cpu_ticks += ticks_to_add;
downcount -= static_cast<s64>(ticks_to_add);
downcount -= static_cast<s64>(cpu_ticks);
}
void CoreTiming::Idle() {
@ -156,7 +180,7 @@ void CoreTiming::Idle() {
}
void CoreTiming::ResetTicks() {
downcount.store(MAX_SLICE_LENGTH, std::memory_order_release);
downcount = MAX_SLICE_LENGTH;
}
u64 CoreTiming::GetClockTicks() const {
@ -177,38 +201,48 @@ std::optional<s64> CoreTiming::Advance() {
std::scoped_lock lock{advance_lock, basic_lock};
global_timer = GetGlobalTimeNs().count();
while (!event_queue.empty() && event_queue.front().time <= global_timer) {
Event evt = std::move(event_queue.front());
std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
event_queue.pop_back();
while (!event_queue.empty() && event_queue.top().time <= global_timer) {
const Event& evt = event_queue.top();
if (const auto event_type = evt.type.lock()) {
if (const auto event_type{evt.type.lock()}) {
const auto evt_time = evt.time;
const auto evt_sequence_num = event_type->sequence_number;
basic_lock.unlock();
if (evt.reschedule_time == 0) {
event_queue.pop();
const auto new_schedule_time = event_type->callback(
evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time});
basic_lock.unlock();
basic_lock.lock();
event_type->callback(
evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time});
if (evt_sequence_num != event_type->sequence_number) {
continue;
}
basic_lock.lock();
} else {
basic_lock.unlock();
if (new_schedule_time.has_value() || evt.reschedule_time != 0) {
const auto next_schedule_time = new_schedule_time.value_or(
std::chrono::nanoseconds{evt.reschedule_time});
const auto new_schedule_time{event_type->callback(
evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time})};
auto next_time = evt.time + next_schedule_time.count();
if (evt.time < pause_end_time) {
next_time = pause_end_time + next_schedule_time.count();
basic_lock.lock();
if (evt_sequence_num != event_type->sequence_number) {
// Heap handle is invalidated after external modification.
continue;
}
event_queue.emplace_back(Event{next_time, event_fifo_id++, evt.type,
next_schedule_time.count()});
std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
const auto next_schedule_time{new_schedule_time.has_value()
? new_schedule_time.value().count()
: evt.reschedule_time};
// If this event was scheduled into a pause, its time now is going to be way
// behind. Re-set this event to continue from the end of the pause.
auto next_time{evt.time + next_schedule_time};
if (evt.time < pause_end_time) {
next_time = pause_end_time + next_schedule_time;
}
event_queue.update(evt.handle, Event{next_time, event_fifo_id++, evt.type,
next_schedule_time, evt.handle});
}
}
@ -216,7 +250,7 @@ std::optional<s64> CoreTiming::Advance() {
}
if (!event_queue.empty()) {
return event_queue.front().time;
return event_queue.top().time;
} else {
return std::nullopt;
}
@ -235,7 +269,7 @@ void CoreTiming::ThreadLoop() {
#ifdef _WIN32
while (!paused && !event.IsSet() && wait_time > 0) {
wait_time = *next_time - GetGlobalTimeNs().count();
if (wait_time >= 1'000'000) { // 1ms
if (wait_time >= timer_resolution_ns) {
Common::Windows::SleepForOneTick();
} else {
#ifdef ARCHITECTURE_x86_64
@ -256,8 +290,10 @@ void CoreTiming::ThreadLoop() {
} else {
// Queue is empty, wait until another event is scheduled and signals us to
// continue.
wait_set = true;
event.Wait();
}
wait_set = false;
}
paused_set = true;
@ -291,4 +327,10 @@ std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const {
return std::chrono::microseconds{Common::WallClock::CPUTickToUS(cpu_ticks)};
}
#ifdef _WIN32
void CoreTiming::SetTimerResolutionNs(std::chrono::nanoseconds ns) {
timer_resolution_ns = ns.count();
}
#endif
} // namespace Core::Timing
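
The queue rework above replaces the old std::vector managed with std::push_heap/std::pop_heap by a boost::heap::fibonacci_heap whose emplace() returns a stable handle, so UnscheduleEvent and the rescheduling path can erase or update a single event in place instead of rebuilding the whole heap. Below is a minimal sketch of that handle mechanism, assuming only that Boost.Heap is available; the Event struct here is a stand-in, not the emulator's.

#include <boost/heap/fibonacci_heap.hpp>
#include <cstdint>
#include <functional>
#include <iostream>
#include <tuple>

struct Event {
    int64_t time;
    uint64_t fifo_order;
    friend bool operator>(const Event& l, const Event& r) {
        return std::tie(l.time, l.fifo_order) > std::tie(r.time, r.fifo_order);
    }
};

// std::greater<> as the heap comparison puts the smallest time at top(),
// which is how Advance() above picks the earliest event.
using heap_t = boost::heap::fibonacci_heap<Event, boost::heap::compare<std::greater<>>>;

int main() {
    heap_t queue;
    auto h1 = queue.emplace(Event{100, 0});  // emplace() hands back a stable handle
    auto h2 = queue.emplace(Event{50, 1});
    queue.erase(h1);                 // remove one specific event, no heap rebuild
    queue.update(h2, Event{25, 2});  // reschedule in place; the heap reorders itself
    std::cout << queue.top().time << '\n';  // prints 25
    return 0;
}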

View file

@ -11,7 +11,8 @@
#include <optional>
#include <string>
#include <thread>
#include <vector>
#include <boost/heap/fibonacci_heap.hpp>
#include "common/common_types.h"
#include "common/thread.h"
@ -42,6 +43,18 @@ enum class UnscheduleEventType {
NoWait,
};
/**
* This is a system to schedule events into the emulated machine's future. Time is measured
* in main CPU clock cycles.
*
* To schedule an event, you first have to register its type. This is where you pass in the
* callback. You then schedule events using the type ID you get back.
*
* The s64 ns_late that the callbacks get is how many ns late it was.
* So to schedule a new event on a regular basis:
* inside callback:
* ScheduleEvent(period_in_ns - ns_late, callback, "whatever")
*/
class CoreTiming {
public:
CoreTiming();
@ -53,56 +66,99 @@ public:
CoreTiming& operator=(const CoreTiming&) = delete;
CoreTiming& operator=(CoreTiming&&) = delete;
/// CoreTiming begins at the boundary of timing slice -1. An initial call to Advance() is
/// required to end slice - 1 and start slice 0 before the first cycle of code is executed.
void Initialize(std::function<void()>&& on_thread_init_);
/// Clear all pending events. This should ONLY be done on exit.
void ClearPendingEvents();
/// Sets if emulation is multicore or single core, must be set before Initialize
void SetMulticore(bool is_multicore_) {
is_multicore = is_multicore_;
}
/// Pauses/Unpauses the execution of the timer thread.
void Pause(bool is_paused);
/// Pauses/Unpauses the execution of the timer thread and waits until paused.
void SyncPause(bool is_paused);
/// Checks if core timing is running.
bool IsRunning() const;
/// Checks if the timer thread has started.
bool HasStarted() const {
return has_started;
}
/// Checks if there are any pending time events.
bool HasPendingEvents() const;
/// Schedules an event in core timing
void ScheduleEvent(std::chrono::nanoseconds ns_into_future,
const std::shared_ptr<EventType>& event_type, bool absolute_time = false);
/// Schedules an event which will automatically re-schedule itself with the given time, until
/// unscheduled
void ScheduleLoopingEvent(std::chrono::nanoseconds start_time,
std::chrono::nanoseconds resched_time,
const std::shared_ptr<EventType>& event_type,
bool absolute_time = false);
void UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
UnscheduleEventType type = UnscheduleEventType::Wait);
void AddTicks(u64 ticks_to_add);
void ResetTicks();
void Idle();
s64 GetDowncount() const {
return downcount.load(std::memory_order_relaxed);
return downcount;
}
/// Returns the current CNTPCT tick value.
u64 GetClockTicks() const;
/// Returns the current GPU tick value.
u64 GetGPUTicks() const;
/// Returns current time in microseconds.
std::chrono::microseconds GetGlobalTimeUs() const;
/// Returns current time in nanoseconds.
std::chrono::nanoseconds GetGlobalTimeNs() const;
/// Checks for events manually and returns time in nanoseconds for next event, threadsafe.
std::optional<s64> Advance();
#ifdef _WIN32
void SetTimerResolutionNs(std::chrono::nanoseconds ns);
#endif
private:
struct Event {
s64 time;
u64 fifo_order;
std::shared_ptr<EventType> type;
bool operator>(const Event& other) const {
return std::tie(time, fifo_order) > std::tie(other.time, other.fifo_order);
}
};
struct Event;
static void ThreadEntry(CoreTiming& instance);
void ThreadLoop();
void Reset();
std::unique_ptr<Common::WallClock> clock;
std::atomic<s64> global_timer{0};
std::vector<Event> event_queue;
std::atomic<u64> event_fifo_id{0};
s64 global_timer = 0;
#ifdef _WIN32
s64 timer_resolution_ns;
#endif
using heap_t =
boost::heap::fibonacci_heap<CoreTiming::Event, boost::heap::compare<std::greater<>>>;
heap_t event_queue;
u64 event_fifo_id = 0;
Common::Event event{};
Common::Event pause_event{};
@ -117,12 +173,20 @@ private:
std::function<void()> on_thread_init{};
bool is_multicore{};
std::atomic<s64> pause_end_time{};
s64 pause_end_time{};
std::atomic<u64> cpu_ticks{};
std::atomic<s64> downcount{};
/// Cycle timing
u64 cpu_ticks{};
s64 downcount{};
};
/// Creates a core timing event with the given name and callback.
///
/// @param name The name of the core timing event to create.
/// @param callback The callback to execute for the event.
///
/// @returns An EventType instance representing the created event.
///
std::shared_ptr<EventType> CreateEvent(std::string name, TimedCallback&& callback);
} // namespace Core::Timing

View file

@ -1,12 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <atomic>
#include <memory>
#include <thread>
#include <vector>
#include "common/fiber.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
@ -30,7 +24,6 @@ void CpuManager::Initialize() {
num_cores = is_multicore ? Core::Hardware::NUM_CPU_CORES : 1;
gpu_barrier = std::make_unique<Common::Barrier>(num_cores + 1);
core_data.resize(num_cores);
for (std::size_t core = 0; core < num_cores; core++) {
core_data[core].host_thread =
std::jthread([this, core](std::stop_token token) { RunThread(token, core); });
@ -38,10 +31,10 @@ void CpuManager::Initialize() {
}
void CpuManager::Shutdown() {
for (auto& data : core_data) {
if (data.host_thread.joinable()) {
data.host_thread.request_stop();
data.host_thread.join();
for (std::size_t core = 0; core < num_cores; core++) {
if (core_data[core].host_thread.joinable()) {
core_data[core].host_thread.request_stop();
core_data[core].host_thread.join();
}
}
}
@ -73,7 +66,12 @@ void CpuManager::HandleInterrupt() {
Kernel::KInterruptManager::HandleInterrupt(kernel, static_cast<s32>(core_index));
}
///////////////////////////////////////////////////////////////////////////////
/// MultiCore ///
///////////////////////////////////////////////////////////////////////////////
void CpuManager::MultiCoreRunGuestThread() {
// Similar to UserModeThreadStarter in HOS
auto& kernel = system.Kernel();
auto* thread = Kernel::GetCurrentThreadPointer(kernel);
kernel.CurrentScheduler()->OnThreadStart();
@ -90,6 +88,10 @@ void CpuManager::MultiCoreRunGuestThread() {
}
void CpuManager::MultiCoreRunIdleThread() {
// Not accurate to HOS. Remove this entire method when singlecore is removed.
// See notes in KScheduler::ScheduleImpl for more information about why this
// is inaccurate.
auto& kernel = system.Kernel();
kernel.CurrentScheduler()->OnThreadStart();
@ -103,6 +105,10 @@ void CpuManager::MultiCoreRunIdleThread() {
}
}
///////////////////////////////////////////////////////////////////////////////
/// SingleCore ///
///////////////////////////////////////////////////////////////////////////////
void CpuManager::SingleCoreRunGuestThread() {
auto& kernel = system.Kernel();
auto* thread = Kernel::GetCurrentThreadPointer(kernel);
@ -148,16 +154,19 @@ void CpuManager::PreemptSingleCore(bool from_running_environment) {
system.CoreTiming().Advance();
kernel.SetIsPhantomModeForSingleCore(false);
}
current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES, std::memory_order_release);
current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES);
system.CoreTiming().ResetTicks();
kernel.Scheduler(current_core).PreemptSingleCore();
// We've now been scheduled again, and we may have exchanged schedulers.
// Reload the scheduler in case it's different.
if (!kernel.Scheduler(current_core).IsIdle()) {
idle_count = 0;
}
}
void CpuManager::GuestActivate() {
// Similar to the HorizonKernelMain callback in HOS
auto& kernel = system.Kernel();
auto* scheduler = kernel.CurrentScheduler();
@ -175,19 +184,27 @@ void CpuManager::ShutdownThread() {
}
void CpuManager::RunThread(std::stop_token token, std::size_t core) {
/// Initialization
system.RegisterCoreThread(core);
std::string name = is_multicore ? "CPUCore_" + std::to_string(core) : "CPUThread";
std::string name;
if (is_multicore) {
name = "CPUCore_" + std::to_string(core);
} else {
name = "CPUThread";
}
MicroProfileOnThreadCreate(name.c_str());
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);
auto& data = core_data[core];
data.host_context = Common::Fiber::ThreadToFiber();
// Cleanup
SCOPE_EXIT {
data.host_context->Exit();
MicroProfileOnThreadExit();
};
// Running
if (!gpu_barrier->Sync(token)) {
return;
}

View file

@ -9,6 +9,7 @@
#include <thread>
#include <boost/algorithm/string.hpp>
#include <fmt/ranges.h>
#include "common/hex_util.h"
#include "common/logging/log.h"

View file

@ -10,7 +10,7 @@ namespace FileSys::SystemArchive {
namespace NgWord1Data {
constexpr std::size_t NUMBER_WORD_TXT_FILES = 0x10;
[[maybe_unused]] constexpr std::size_t NUMBER_WORD_TXT_FILES = 0x10;
// Should this archive replacement mysteriously not work on a future game, consider updating.
constexpr std::array<u8, 4> VERSION_DAT{0x0, 0x0, 0x0, 0x20}; // 11.0.1 System Version

View file

@ -15,6 +15,7 @@
#endif
#include <fmt/format.h>
#include <fmt/ranges.h>
#include "common/fs/file.h"
#include "common/fs/fs.h"

View file

@ -167,7 +167,7 @@ constexpr inline Result GetSpanBetweenTimePoints(s64* out_seconds, const SteadyC
template <>
struct fmt::formatter<Service::PSC::Time::TimeType> : fmt::formatter<fmt::string_view> {
template <typename FormatContext>
auto format(Service::PSC::Time::TimeType type, FormatContext& ctx) {
auto format(Service::PSC::Time::TimeType type, FormatContext& ctx) const {
const string_view name = [type] {
using Service::PSC::Time::TimeType;
switch (type) {
@ -270,4 +270,4 @@ struct fmt::formatter<Service::PSC::Time::ContinuousAdjustmentTimePoint>
time_point.rtc_offset, time_point.diff_scale, time_point.shift_amount,
time_point.lower, time_point.upper);
}
};
};

File diff suppressed because it is too large.

View file

@ -184,7 +184,7 @@ struct fmt::formatter<Shader::Backend::GLASM::Id> {
return ctx.begin();
}
template <typename FormatContext>
auto format(Shader::Backend::GLASM::Id id, FormatContext& ctx) {
auto format(Shader::Backend::GLASM::Id id, FormatContext& ctx) const {
return Shader::Backend::GLASM::FormatTo<true>(ctx, id);
}
};
@ -195,7 +195,7 @@ struct fmt::formatter<Shader::Backend::GLASM::Register> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::Register& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::Register& value, FormatContext& ctx) const {
if (value.type != Shader::Backend::GLASM::Type::Register) {
throw Shader::InvalidArgument("Register value type is not register");
}
@ -209,7 +209,7 @@ struct fmt::formatter<Shader::Backend::GLASM::ScalarRegister> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::ScalarRegister& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::ScalarRegister& value, FormatContext& ctx) const {
if (value.type != Shader::Backend::GLASM::Type::Register) {
throw Shader::InvalidArgument("Register value type is not register");
}
@ -223,7 +223,7 @@ struct fmt::formatter<Shader::Backend::GLASM::ScalarU32> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::ScalarU32& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::ScalarU32& value, FormatContext& ctx) const {
switch (value.type) {
case Shader::Backend::GLASM::Type::Void:
break;
@ -244,7 +244,7 @@ struct fmt::formatter<Shader::Backend::GLASM::ScalarS32> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::ScalarS32& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::ScalarS32& value, FormatContext& ctx) const {
switch (value.type) {
case Shader::Backend::GLASM::Type::Void:
break;
@ -265,7 +265,7 @@ struct fmt::formatter<Shader::Backend::GLASM::ScalarF32> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::ScalarF32& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::ScalarF32& value, FormatContext& ctx) const {
switch (value.type) {
case Shader::Backend::GLASM::Type::Void:
break;
@ -286,7 +286,7 @@ struct fmt::formatter<Shader::Backend::GLASM::ScalarF64> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Backend::GLASM::ScalarF64& value, FormatContext& ctx) {
auto format(const Shader::Backend::GLASM::ScalarF64& value, FormatContext& ctx) const {
switch (value.type) {
case Shader::Backend::GLASM::Type::Void:
break;

View file

@ -250,7 +250,7 @@ struct fmt::formatter<Shader::IR::Attribute> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Attribute& attribute, FormatContext& ctx) {
auto format(const Shader::IR::Attribute& attribute, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", Shader::IR::NameOf(attribute));
}
};

View file

@ -52,7 +52,7 @@ struct fmt::formatter<Shader::IR::Condition> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Condition& cond, FormatContext& ctx) {
auto format(const Shader::IR::Condition& cond, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", Shader::IR::NameOf(cond));
}
};

View file

@ -55,7 +55,7 @@ struct fmt::formatter<Shader::IR::FlowTest> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::FlowTest& flow_test, FormatContext& ctx) {
auto format(const Shader::IR::FlowTest& flow_test, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", Shader::IR::NameOf(flow_test));
}
};

View file

@ -54,7 +54,7 @@ constexpr Type F64x2{Type::F64x2};
constexpr Type F64x3{Type::F64x3};
constexpr Type F64x4{Type::F64x4};
constexpr OpcodeMeta META_TABLE[]{
constexpr OpcodeMeta META_TABLE[] {
#define OPCODE(name_token, type_token, ...) \
{ \
.name{#name_token}, \
@ -103,7 +103,7 @@ struct fmt::formatter<Shader::IR::Opcode> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Opcode& op, FormatContext& ctx) {
auto format(const Shader::IR::Opcode& op, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", Shader::IR::NameOf(op));
}
};

View file

@ -33,7 +33,7 @@ struct fmt::formatter<Shader::IR::Pred> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Pred& pred, FormatContext& ctx) {
auto format(const Shader::IR::Pred& pred, FormatContext& ctx) const {
if (pred == Shader::IR::Pred::PT) {
return fmt::format_to(ctx.out(), "PT");
} else {

View file

@ -319,7 +319,7 @@ struct fmt::formatter<Shader::IR::Reg> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Reg& reg, FormatContext& ctx) {
auto format(const Shader::IR::Reg& reg, FormatContext& ctx) const {
if (reg == Shader::IR::Reg::RZ) {
return fmt::format_to(ctx.out(), "RZ");
} else if (static_cast<int>(reg) >= 0 && static_cast<int>(reg) < 255) {

View file

@ -54,7 +54,7 @@ struct fmt::formatter<Shader::IR::Type> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::IR::Type& type, FormatContext& ctx) {
auto format(const Shader::IR::Type& type, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", NameOf(type));
}
};

View file

@ -102,7 +102,7 @@ struct fmt::formatter<Shader::Maxwell::Location> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Maxwell::Location& location, FormatContext& ctx) {
auto format(const Shader::Maxwell::Location& location, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{:04x}", location.Offset());
}
};

View file

@ -23,7 +23,7 @@ struct fmt::formatter<Shader::Maxwell::Opcode> {
return ctx.begin();
}
template <typename FormatContext>
auto format(const Shader::Maxwell::Opcode& opcode, FormatContext& ctx) {
auto format(const Shader::Maxwell::Opcode& opcode, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", NameOf(opcode));
}
};

View file

@ -9,6 +9,8 @@
#include <memory>
#include <thread>
#include <fmt/ranges.h>
#include "core/hle/service/am/applet_manager.h"
#include "core/loader/nca.h"
#include "core/loader/nro.h"

View file

@ -40,23 +40,10 @@ struct GPU::Impl {
explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
: gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_},
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)} {
Initialize();
}
gpu_thread{system_, is_async_}, scheduler{std::make_unique<Control::Scheduler>(gpu)} {}
~Impl() = default;
void Initialize() {
// Initialize the GPU memory manager
memory_manager = std::make_unique<Tegra::MemoryManager>(system);
// Initialize the command buffer
command_buffer.reserve(COMMAND_BUFFER_SIZE);
// Initialize the fence manager
fence_manager = std::make_unique<FenceManager>();
}
std::shared_ptr<Control::ChannelState> CreateChannel(s32 channel_id) {
auto channel_state = std::make_shared<Tegra::Control::ChannelState>(channel_id);
channels.emplace(channel_id, channel_state);
@ -104,15 +91,14 @@ struct GPU::Impl {
/// Flush all current written commands into the host GPU for execution.
void FlushCommands() {
if (!command_buffer.empty()) {
rasterizer->ExecuteCommands(command_buffer);
command_buffer.clear();
}
rasterizer->FlushCommands();
}
/// Synchronizes CPU writes with Host GPU memory.
void InvalidateGPUCache() {
rasterizer->InvalidateGPUCache();
std::function<void(PAddr, size_t)> callback_writes(
[this](PAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); });
system.GatherGPUDirtyMemory(callback_writes);
}
/// Signal the ending of command list.
@ -122,10 +108,11 @@ struct GPU::Impl {
}
/// Request a host GPU memory flush from the CPU.
u64 RequestSyncOperation(std::function<void()>&& action) {
template <typename Func>
[[nodiscard]] u64 RequestSyncOperation(Func&& action) {
std::unique_lock lck{sync_request_mutex};
const u64 fence = ++last_sync_fence;
sync_requests.emplace_back(std::move(action), fence);
sync_requests.emplace_back(action);
return fence;
}
@ -143,12 +130,12 @@ struct GPU::Impl {
void TickWork() {
std::unique_lock lck{sync_request_mutex};
while (!sync_requests.empty()) {
auto& request = sync_requests.front();
auto request = std::move(sync_requests.front());
sync_requests.pop_front();
sync_request_mutex.unlock();
request.first();
request();
current_sync_fence.fetch_add(1, std::memory_order_release);
sync_request_mutex.lock();
sync_requests.pop_front();
sync_request_cv.notify_all();
}
}
@ -235,6 +222,7 @@ struct GPU::Impl {
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
void Start() {
Settings::UpdateGPUAccuracy();
gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler);
}
@ -264,7 +252,7 @@ struct GPU::Impl {
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(DAddr addr, u64 size) {
rasterizer->FlushRegion(addr, size);
gpu_thread.FlushRegion(addr, size);
}
VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size) {
@ -284,7 +272,7 @@ struct GPU::Impl {
/// Notify rasterizer that any caches of the specified region should be invalidated
void InvalidateRegion(DAddr addr, u64 size) {
rasterizer->InvalidateRegion(addr, size);
gpu_thread.InvalidateRegion(addr, size);
}
bool OnCPUWrite(DAddr addr, u64 size) {
@ -293,7 +281,57 @@ struct GPU::Impl {
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(DAddr addr, u64 size) {
rasterizer->FlushAndInvalidateRegion(addr, size);
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
std::vector<Service::Nvidia::NvFence>&& fences) {
size_t num_fences{fences.size()};
size_t current_request_counter{};
{
std::unique_lock<std::mutex> lk(request_swap_mutex);
if (free_swap_counters.empty()) {
current_request_counter = request_swap_counters.size();
request_swap_counters.emplace_back(num_fences);
} else {
current_request_counter = free_swap_counters.front();
request_swap_counters[current_request_counter] = num_fences;
free_swap_counters.pop_front();
}
}
const auto wait_fence =
RequestSyncOperation([this, current_request_counter, &layers, &fences, num_fences] {
auto& syncpoint_manager = host1x.GetSyncpointManager();
if (num_fences == 0) {
renderer->Composite(layers);
}
const auto executer = [this, current_request_counter, layers_copy = layers]() {
{
std::unique_lock<std::mutex> lk(request_swap_mutex);
if (--request_swap_counters[current_request_counter] != 0) {
return;
}
free_swap_counters.push_back(current_request_counter);
}
renderer->Composite(layers_copy);
};
for (size_t i = 0; i < num_fences; i++) {
syncpoint_manager.RegisterGuestAction(fences[i].id, fences[i].value, executer);
}
});
gpu_thread.TickGPU();
WaitForSyncOperation(wait_fence);
}
std::vector<u8> GetAppletCaptureBuffer() {
std::vector<u8> out;
const auto wait_fence =
RequestSyncOperation([&] { out = renderer->GetAppletCaptureBuffer(); });
gpu_thread.TickGPU();
WaitForSyncOperation(wait_fence);
return out;
}
GPU& gpu;
@ -310,12 +348,16 @@ struct GPU::Impl {
/// When true, we are about to shut down emulation session, so terminate outstanding tasks
std::atomic_bool shutting_down{};
std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex;
std::mutex device_mutex;
std::condition_variable sync_cv;
std::list<std::pair<std::function<void()>, u64>> sync_requests;
std::list<std::function<void()>> sync_requests;
std::atomic<u64> current_sync_fence{};
u64 last_sync_fence{};
std::mutex sync_request_mutex;
@ -331,13 +373,182 @@ struct GPU::Impl {
Tegra::Control::ChannelState* current_channel;
s32 bound_channel{-1};
std::unique_ptr<Tegra::MemoryManager> memory_manager;
std::vector<u32> command_buffer;
std::unique_ptr<FenceManager> fence_manager;
static constexpr size_t COMMAND_BUFFER_SIZE = 4 * 1024 * 1024;
std::deque<size_t> free_swap_counters;
std::deque<size_t> request_swap_counters;
std::mutex request_swap_mutex;
};
// ... (rest of the implementation remains the same)
GPU::GPU(Core::System& system, bool is_async, bool use_nvdec)
: impl{std::make_unique<Impl>(*this, system, is_async, use_nvdec)} {}
GPU::~GPU() = default;
std::shared_ptr<Control::ChannelState> GPU::AllocateChannel() {
return impl->AllocateChannel();
}
void GPU::InitChannel(Control::ChannelState& to_init, u64 program_id) {
impl->InitChannel(to_init, program_id);
}
void GPU::BindChannel(s32 channel_id) {
impl->BindChannel(channel_id);
}
void GPU::ReleaseChannel(Control::ChannelState& to_release) {
impl->ReleaseChannel(to_release);
}
void GPU::InitAddressSpace(Tegra::MemoryManager& memory_manager) {
impl->InitAddressSpace(memory_manager);
}
void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer) {
impl->BindRenderer(std::move(renderer));
}
void GPU::FlushCommands() {
impl->FlushCommands();
}
void GPU::InvalidateGPUCache() {
impl->InvalidateGPUCache();
}
void GPU::OnCommandListEnd() {
impl->OnCommandListEnd();
}
u64 GPU::RequestFlush(DAddr addr, std::size_t size) {
return impl->RequestSyncOperation(
[this, addr, size]() { impl->rasterizer->FlushRegion(addr, size); });
}
u64 GPU::CurrentSyncRequestFence() const {
return impl->CurrentSyncRequestFence();
}
void GPU::WaitForSyncOperation(u64 fence) {
return impl->WaitForSyncOperation(fence);
}
void GPU::TickWork() {
impl->TickWork();
}
/// Gets a mutable reference to the Host1x interface
Host1x::Host1x& GPU::Host1x() {
return impl->host1x;
}
/// Gets an immutable reference to the Host1x interface.
const Host1x::Host1x& GPU::Host1x() const {
return impl->host1x;
}
Engines::Maxwell3D& GPU::Maxwell3D() {
return impl->Maxwell3D();
}
const Engines::Maxwell3D& GPU::Maxwell3D() const {
return impl->Maxwell3D();
}
Engines::KeplerCompute& GPU::KeplerCompute() {
return impl->KeplerCompute();
}
const Engines::KeplerCompute& GPU::KeplerCompute() const {
return impl->KeplerCompute();
}
Tegra::DmaPusher& GPU::DmaPusher() {
return impl->DmaPusher();
}
const Tegra::DmaPusher& GPU::DmaPusher() const {
return impl->DmaPusher();
}
VideoCore::RendererBase& GPU::Renderer() {
return impl->Renderer();
}
const VideoCore::RendererBase& GPU::Renderer() const {
return impl->Renderer();
}
VideoCore::ShaderNotify& GPU::ShaderNotify() {
return impl->ShaderNotify();
}
const VideoCore::ShaderNotify& GPU::ShaderNotify() const {
return impl->ShaderNotify();
}
void GPU::RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
std::vector<Service::Nvidia::NvFence>&& fences) {
impl->RequestComposite(std::move(layers), std::move(fences));
}
std::vector<u8> GPU::GetAppletCaptureBuffer() {
return impl->GetAppletCaptureBuffer();
}
u64 GPU::GetTicks() const {
return impl->GetTicks();
}
bool GPU::IsAsync() const {
return impl->IsAsync();
}
bool GPU::UseNvdec() const {
return impl->UseNvdec();
}
void GPU::RendererFrameEndNotify() {
impl->RendererFrameEndNotify();
}
void GPU::Start() {
impl->Start();
}
void GPU::NotifyShutdown() {
impl->NotifyShutdown();
}
void GPU::ObtainContext() {
impl->ObtainContext();
}
void GPU::ReleaseContext() {
impl->ReleaseContext();
}
void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) {
impl->PushGPUEntries(channel, std::move(entries));
}
VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) {
return impl->OnCPURead(addr, size);
}
void GPU::FlushRegion(DAddr addr, u64 size) {
impl->FlushRegion(addr, size);
}
void GPU::InvalidateRegion(DAddr addr, u64 size) {
impl->InvalidateRegion(addr, size);
}
bool GPU::OnCPUWrite(DAddr addr, u64 size) {
return impl->OnCPUWrite(addr, size);
}
void GPU::FlushAndInvalidateRegion(DAddr addr, u64 size) {
impl->FlushAndInvalidateRegion(addr, size);
}
} // namespace Tegra
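
The GPU::Impl changes above keep the existing fence discipline: RequestSyncOperation enqueues a callable and hands back a fence value, TickWork drains the queue and advances current_sync_fence, and WaitForSyncOperation blocks until that fence is reached (GetAppletCaptureBuffer shows the full round trip). The following is a simplified standalone model of that pattern; the SyncQueue name and layout are hypothetical and not taken from the diff.

#include <condition_variable>
#include <cstdint>
#include <deque>
#include <functional>
#include <mutex>

class SyncQueue {
public:
    // Producer side: enqueue work, get back the fence value that will signal it.
    uint64_t Request(std::function<void()> action) {
        std::unique_lock lk{mutex};
        requests.push_back(std::move(action));
        return ++last_fence;
    }

    // Consumer side (the "GPU thread"): run pending work and advance the fence.
    void Tick() {
        std::unique_lock lk{mutex};
        while (!requests.empty()) {
            auto request = std::move(requests.front());
            requests.pop_front();
            lk.unlock();
            request();  // run outside the lock, as TickWork does above
            lk.lock();
            ++current_fence;
            cv.notify_all();
        }
    }

    // Block until the work behind the given fence has executed.
    void Wait(uint64_t fence) {
        std::unique_lock lk{mutex};
        cv.wait(lk, [&] { return current_fence >= fence; });
    }

private:
    std::mutex mutex;
    std::condition_variable cv;
    std::deque<std::function<void()>> requests;
    uint64_t last_fence{};
    uint64_t current_fence{};
};

// Typical use, mirroring GetAppletCaptureBuffer: wait_fence = Request(work); Wait(wait_fence);
// while another thread periodically calls Tick().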

View file

@ -1,221 +0,0 @@
#include "video_core/optimized_rasterizer.h"
#include "common/settings.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/engines/maxwell_3d.h"
namespace VideoCore {
OptimizedRasterizer::OptimizedRasterizer(Core::System& system, Tegra::GPU& gpu)
: system{system}, gpu{gpu}, memory_manager{gpu.MemoryManager()} {
InitializeShaderCache();
}
OptimizedRasterizer::~OptimizedRasterizer() = default;
void OptimizedRasterizer::Draw(bool is_indexed, u32 instance_count) {
MICROPROFILE_SCOPE(GPU_Rasterization);
PrepareRendertarget();
UpdateDynamicState();
if (is_indexed) {
DrawIndexed(instance_count);
} else {
DrawArrays(instance_count);
}
}
void OptimizedRasterizer::Clear(u32 layer_count) {
MICROPROFILE_SCOPE(GPU_Rasterization);
PrepareRendertarget();
ClearFramebuffer(layer_count);
}
void OptimizedRasterizer::DispatchCompute() {
MICROPROFILE_SCOPE(GPU_Compute);
PrepareCompute();
LaunchComputeShader();
}
void OptimizedRasterizer::ResetCounter(VideoCommon::QueryType type) {
query_cache.ResetCounter(type);
}
void OptimizedRasterizer::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
query_cache.Query(gpu_addr, type, flags, payload, subreport);
}
void OptimizedRasterizer::FlushAll() {
MICROPROFILE_SCOPE(GPU_Synchronization);
FlushShaderCache();
FlushRenderTargets();
}
void OptimizedRasterizer::FlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) {
MICROPROFILE_SCOPE(GPU_Synchronization);
if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) {
FlushMemoryRegion(addr, size);
}
}
bool OptimizedRasterizer::MustFlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) {
if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) {
return IsRegionCached(addr, size);
}
return false;
}
RasterizerDownloadArea OptimizedRasterizer::GetFlushArea(DAddr addr, u64 size) {
return GetFlushableArea(addr, size);
}
void OptimizedRasterizer::InvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) {
MICROPROFILE_SCOPE(GPU_Synchronization);
if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) {
InvalidateMemoryRegion(addr, size);
}
}
void OptimizedRasterizer::OnCacheInvalidation(PAddr addr, u64 size) {
MICROPROFILE_SCOPE(GPU_Synchronization);
InvalidateCachedRegion(addr, size);
}
bool OptimizedRasterizer::OnCPUWrite(PAddr addr, u64 size) {
return HandleCPUWrite(addr, size);
}
void OptimizedRasterizer::InvalidateGPUCache() {
MICROPROFILE_SCOPE(GPU_Synchronization);
InvalidateAllCache();
}
void OptimizedRasterizer::UnmapMemory(DAddr addr, u64 size) {
MICROPROFILE_SCOPE(GPU_Synchronization);
UnmapGPUMemoryRegion(addr, size);
}
void OptimizedRasterizer::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
MICROPROFILE_SCOPE(GPU_Synchronization);
UpdateMappedGPUMemory(as_id, addr, size);
}
void OptimizedRasterizer::FlushAndInvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) {
MICROPROFILE_SCOPE(GPU_Synchronization);
if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) {
FlushAndInvalidateMemoryRegion(addr, size);
}
}
void OptimizedRasterizer::WaitForIdle() {
MICROPROFILE_SCOPE(GPU_Synchronization);
WaitForGPUIdle();
}
void OptimizedRasterizer::FragmentBarrier() {
MICROPROFILE_SCOPE(GPU_Synchronization);
InsertFragmentBarrier();
}
void OptimizedRasterizer::TiledCacheBarrier() {
MICROPROFILE_SCOPE(GPU_Synchronization);
InsertTiledCacheBarrier();
}
void OptimizedRasterizer::FlushCommands() {
MICROPROFILE_SCOPE(GPU_Synchronization);
SubmitCommands();
}
void OptimizedRasterizer::TickFrame() {
MICROPROFILE_SCOPE(GPU_Synchronization);
EndFrame();
}
void OptimizedRasterizer::PrepareRendertarget() {
const auto& regs{gpu.Maxwell3D().regs};
const auto& framebuffer{regs.framebuffer};
render_targets.resize(framebuffer.num_color_buffers);
for (std::size_t index = 0; index < framebuffer.num_color_buffers; ++index) {
render_targets[index] = GetColorBuffer(index);
}
depth_stencil = GetDepthBuffer();
}
void OptimizedRasterizer::UpdateDynamicState() {
const auto& regs{gpu.Maxwell3D().regs};
UpdateViewport(regs.viewport_transform);
UpdateScissor(regs.scissor_test);
UpdateDepthBias(regs.polygon_offset_units, regs.polygon_offset_clamp, regs.polygon_offset_factor);
UpdateBlendConstants(regs.blend_color);
UpdateStencilFaceMask(regs.stencil_front_func_mask, regs.stencil_back_func_mask);
}
void OptimizedRasterizer::DrawIndexed(u32 instance_count) {
const auto& draw_state{gpu.Maxwell3D().draw_manager->GetDrawState()};
const auto& index_buffer{memory_manager.ReadBlockUnsafe(draw_state.index_buffer.Address(),
draw_state.index_buffer.size)};
shader_cache.BindComputeShader();
shader_cache.BindGraphicsShader();
DrawElementsInstanced(draw_state.topology, draw_state.index_buffer.count,
draw_state.index_buffer.format, index_buffer.data(), instance_count);
}
void OptimizedRasterizer::DrawArrays(u32 instance_count) {
const auto& draw_state{gpu.Maxwell3D().draw_manager->GetDrawState()};
shader_cache.BindComputeShader();
shader_cache.BindGraphicsShader();
DrawArraysInstanced(draw_state.topology, draw_state.vertex_buffer.first,
draw_state.vertex_buffer.count, instance_count);
}
void OptimizedRasterizer::ClearFramebuffer(u32 layer_count) {
const auto& regs{gpu.Maxwell3D().regs};
const auto& clear_state{regs.clear_buffers};
if (clear_state.R || clear_state.G || clear_state.B || clear_state.A) {
ClearColorBuffers(clear_state.R, clear_state.G, clear_state.B, clear_state.A,
regs.clear_color[0], regs.clear_color[1], regs.clear_color[2],
regs.clear_color[3], layer_count);
}
if (clear_state.Z || clear_state.S) {
ClearDepthStencilBuffer(clear_state.Z, clear_state.S, regs.clear_depth, regs.clear_stencil,
layer_count);
}
}
void OptimizedRasterizer::PrepareCompute() {
shader_cache.BindComputeShader();
}
void OptimizedRasterizer::LaunchComputeShader() {
const auto& launch_desc{gpu.KeplerCompute().launch_description};
DispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
}
} // namespace VideoCore

View file

@ -1,73 +0,0 @@
#pragma once
#include <memory>
#include <vector>
#include "common/common_types.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/engines/maxwell_3d.h"
namespace Core {
class System;
}
namespace Tegra {
class GPU;
class MemoryManager;
}
namespace VideoCore {
class ShaderCache;
class QueryCache;
class OptimizedRasterizer final : public RasterizerInterface {
public:
explicit OptimizedRasterizer(Core::System& system, Tegra::GPU& gpu);
~OptimizedRasterizer() override;
void Draw(bool is_indexed, u32 instance_count) override;
void Clear(u32 layer_count) override;
void DispatchCompute() override;
void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void FlushAll() override;
void FlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override;
bool MustFlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override;
RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override;
void InvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override;
void OnCacheInvalidation(PAddr addr, u64 size) override;
bool OnCPUWrite(PAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void FlushAndInvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override;
void WaitForIdle() override;
void FragmentBarrier() override;
void TiledCacheBarrier() override;
void FlushCommands() override;
void TickFrame() override;
private:
void PrepareRendertarget();
void UpdateDynamicState();
void DrawIndexed(u32 instance_count);
void DrawArrays(u32 instance_count);
void ClearFramebuffer(u32 layer_count);
void PrepareCompute();
void LaunchComputeShader();
Core::System& system;
Tegra::GPU& gpu;
Tegra::MemoryManager& memory_manager;
std::unique_ptr<ShaderCache> shader_cache;
std::unique_ptr<QueryCache> query_cache;
std::vector<RenderTargetConfig> render_targets;
DepthStencilConfig depth_stencil;
// Add any additional member variables needed for the optimized rasterizer
};
} // namespace VideoCore

View file

@ -3,18 +3,9 @@
#include <algorithm>
#include <array>
#include <atomic>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <thread>
#include <vector>
#include "common/assert.h"
#include "common/fs/file.h"
#include "common/fs/path_util.h"
#include "common/logging/log.h"
#include "common/thread_worker.h"
#include "shader_recompiler/frontend/maxwell/control_flow.h"
#include "shader_recompiler/object_pool.h"
#include "video_core/control/channel_state.h"
@ -28,288 +19,233 @@
namespace VideoCommon {
constexpr size_t MAX_SHADER_CACHE_SIZE = 1024 * 1024 * 1024; // 1GB
class ShaderCacheWorker : public Common::ThreadWorker {
public:
explicit ShaderCacheWorker(const std::string& name) : ThreadWorker(name) {}
~ShaderCacheWorker() = default;
void CompileShader(ShaderInfo* shader) {
Push([shader]() {
// Compile shader here
// This is a placeholder for the actual compilation process
std::this_thread::sleep_for(std::chrono::milliseconds(10));
shader->is_compiled.store(true, std::memory_order_release);
});
}
};
class ShaderCache::Impl {
public:
explicit Impl(Tegra::MaxwellDeviceMemoryManager& device_memory_)
: device_memory{device_memory_}, workers{CreateWorkers()} {
LoadCache();
}
~Impl() {
SaveCache();
}
void InvalidateRegion(VAddr addr, size_t size) {
std::scoped_lock lock{invalidation_mutex};
InvalidatePagesInRegion(addr, size);
RemovePendingShaders();
}
void OnCacheInvalidation(VAddr addr, size_t size) {
std::scoped_lock lock{invalidation_mutex};
InvalidatePagesInRegion(addr, size);
}
void SyncGuestHost() {
std::scoped_lock lock{invalidation_mutex};
RemovePendingShaders();
}
bool RefreshStages(std::array<u64, 6>& unique_hashes);
const ShaderInfo* ComputeShader();
void GetGraphicsEnvironments(GraphicsEnvironments& result, const std::array<u64, NUM_PROGRAMS>& unique_hashes);
ShaderInfo* TryGet(VAddr addr) const {
std::scoped_lock lock{lookup_mutex};
const auto it = lookup_cache.find(addr);
if (it == lookup_cache.end()) {
return nullptr;
}
return it->second->data;
}
void Register(std::unique_ptr<ShaderInfo> data, VAddr addr, size_t size) {
std::scoped_lock lock{invalidation_mutex, lookup_mutex};
const VAddr addr_end = addr + size;
Entry* const entry = NewEntry(addr, addr_end, data.get());
const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) {
invalidation_cache[page].push_back(entry);
}
storage.push_back(std::move(data));
device_memory.UpdatePagesCachedCount(addr, size, 1);
}
private:
std::vector<std::unique_ptr<ShaderCacheWorker>> CreateWorkers() {
const size_t num_workers = std::thread::hardware_concurrency();
std::vector<std::unique_ptr<ShaderCacheWorker>> workers;
workers.reserve(num_workers);
for (size_t i = 0; i < num_workers; ++i) {
workers.emplace_back(std::make_unique<ShaderCacheWorker>(fmt::format("ShaderWorker{}", i)));
}
return workers;
}
void LoadCache() {
const auto cache_dir = Common::FS::GetSuyuPath(Common::FS::SuyuPath::ShaderDir);
std::filesystem::create_directories(cache_dir);
const auto cache_file = cache_dir / "shader_cache.bin";
if (!std::filesystem::exists(cache_file)) {
return;
}
std::ifstream file(cache_file, std::ios::binary);
if (!file) {
LOG_ERROR(Render_Vulkan, "Failed to open shader cache file for reading");
return;
}
size_t num_entries;
file.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
for (size_t i = 0; i < num_entries; ++i) {
VAddr addr;
size_t size;
file.read(reinterpret_cast<char*>(&addr), sizeof(addr));
file.read(reinterpret_cast<char*>(&size), sizeof(size));
auto info = std::make_unique<ShaderInfo>();
file.read(reinterpret_cast<char*>(info.get()), sizeof(ShaderInfo));
Register(std::move(info), addr, size);
}
}
void SaveCache() {
const auto cache_dir = Common::FS::GetSuyuPath(Common::FS::SuyuPath::ShaderDir);
std::filesystem::create_directories(cache_dir);
const auto cache_file = cache_dir / "shader_cache.bin";
std::ofstream file(cache_file, std::ios::binary | std::ios::trunc);
if (!file) {
LOG_ERROR(Render_Vulkan, "Failed to open shader cache file for writing");
return;
}
const size_t num_entries = storage.size();
file.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
for (const auto& shader : storage) {
const VAddr addr = shader->addr;
const size_t size = shader->size_bytes;
file.write(reinterpret_cast<const char*>(&addr), sizeof(addr));
file.write(reinterpret_cast<const char*>(&size), sizeof(size));
file.write(reinterpret_cast<const char*>(shader.get()), sizeof(ShaderInfo));
}
}
void InvalidatePagesInRegion(VAddr addr, size_t size) {
const VAddr addr_end = addr + size;
const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) {
auto it = invalidation_cache.find(page);
if (it == invalidation_cache.end()) {
continue;
}
InvalidatePageEntries(it->second, addr, addr_end);
}
}
void RemovePendingShaders() {
if (marked_for_removal.empty()) {
return;
}
// Remove duplicates
std::sort(marked_for_removal.begin(), marked_for_removal.end());
marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()),
marked_for_removal.end());
std::vector<ShaderInfo*> removed_shaders;
std::scoped_lock lock{lookup_mutex};
for (Entry* const entry : marked_for_removal) {
removed_shaders.push_back(entry->data);
const auto it = lookup_cache.find(entry->addr_start);
ASSERT(it != lookup_cache.end());
lookup_cache.erase(it);
}
marked_for_removal.clear();
if (!removed_shaders.empty()) {
RemoveShadersFromStorage(removed_shaders);
}
}
void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
size_t index = 0;
while (index < entries.size()) {
Entry* const entry = entries[index];
if (!entry->Overlaps(addr, addr_end)) {
++index;
continue;
}
UnmarkMemory(entry);
RemoveEntryFromInvalidationCache(entry);
marked_for_removal.push_back(entry);
}
}
void RemoveEntryFromInvalidationCache(const Entry* entry) {
const u64 page_end = (entry->addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = entry->addr_start >> SUYU_PAGEBITS; page < page_end; ++page) {
const auto entries_it = invalidation_cache.find(page);
ASSERT(entries_it != invalidation_cache.end());
std::vector<Entry*>& entries = entries_it->second;
const auto entry_it = std::find(entries.begin(), entries.end(), entry);
ASSERT(entry_it != entries.end());
entries.erase(entry_it);
}
}
void UnmarkMemory(Entry* entry) {
if (!entry->is_memory_marked) {
return;
}
entry->is_memory_marked = false;
const VAddr addr = entry->addr_start;
const size_t size = entry->addr_end - addr;
device_memory.UpdatePagesCachedCount(addr, size, -1);
}
void RemoveShadersFromStorage(const std::vector<ShaderInfo*>& removed_shaders) {
storage.erase(
std::remove_if(storage.begin(), storage.end(),
[&removed_shaders](const std::unique_ptr<ShaderInfo>& shader) {
return std::find(removed_shaders.begin(), removed_shaders.end(),
shader.get()) != removed_shaders.end();
}),
storage.end());
}
Entry* NewEntry(VAddr addr, VAddr addr_end, ShaderInfo* data) {
auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data});
Entry* const entry_pointer = entry.get();
lookup_cache.emplace(addr, std::move(entry));
return entry_pointer;
}
Tegra::MaxwellDeviceMemoryManager& device_memory;
std::vector<std::unique_ptr<ShaderCacheWorker>> workers;
mutable std::mutex lookup_mutex;
std::mutex invalidation_mutex;
std::unordered_map<VAddr, std::unique_ptr<Entry>> lookup_cache;
std::unordered_map<u64, std::vector<Entry*>> invalidation_cache;
std::vector<std::unique_ptr<ShaderInfo>> storage;
std::vector<Entry*> marked_for_removal;
};
ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_)
: impl{std::make_unique<Impl>(device_memory_)} {}
ShaderCache::~ShaderCache() = default;
void ShaderCache::InvalidateRegion(VAddr addr, size_t size) {
impl->InvalidateRegion(addr, size);
std::scoped_lock lock{invalidation_mutex};
InvalidatePagesInRegion(addr, size);
RemovePendingShaders();
}
void ShaderCache::OnCacheInvalidation(VAddr addr, size_t size) {
impl->OnCacheInvalidation(addr, size);
std::scoped_lock lock{invalidation_mutex};
InvalidatePagesInRegion(addr, size);
}
void ShaderCache::SyncGuestHost() {
impl->SyncGuestHost();
std::scoped_lock lock{invalidation_mutex};
RemovePendingShaders();
}
ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_)
: device_memory{device_memory_} {}
bool ShaderCache::RefreshStages(std::array<u64, 6>& unique_hashes) {
return impl->RefreshStages(unique_hashes);
auto& dirty{maxwell3d->dirty.flags};
if (!dirty[VideoCommon::Dirty::Shaders]) {
return last_shaders_valid;
}
dirty[VideoCommon::Dirty::Shaders] = false;
const GPUVAddr base_addr{maxwell3d->regs.program_region.Address()};
for (size_t index = 0; index < Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; ++index) {
if (!maxwell3d->regs.IsShaderConfigEnabled(index)) {
unique_hashes[index] = 0;
continue;
}
const auto& shader_config{maxwell3d->regs.pipelines[index]};
const auto program{static_cast<Tegra::Engines::Maxwell3D::Regs::ShaderType>(index)};
if (program == Tegra::Engines::Maxwell3D::Regs::ShaderType::Pixel &&
!maxwell3d->regs.rasterize_enable) {
unique_hashes[index] = 0;
continue;
}
const GPUVAddr shader_addr{base_addr + shader_config.offset};
const std::optional<VAddr> cpu_shader_addr{gpu_memory->GpuToCpuAddress(shader_addr)};
if (!cpu_shader_addr) {
LOG_ERROR(HW_GPU, "Invalid GPU address for shader 0x{:016x}", shader_addr);
last_shaders_valid = false;
return false;
}
const ShaderInfo* shader_info{TryGet(*cpu_shader_addr)};
if (!shader_info) {
const u32 start_address{shader_config.offset};
GraphicsEnvironment env{*maxwell3d, *gpu_memory, program, base_addr, start_address};
shader_info = MakeShaderInfo(env, *cpu_shader_addr);
}
shader_infos[index] = shader_info;
unique_hashes[index] = shader_info->unique_hash;
}
last_shaders_valid = true;
return true;
}
const ShaderInfo* ShaderCache::ComputeShader() {
return impl->ComputeShader();
const GPUVAddr program_base{kepler_compute->regs.code_loc.Address()};
const auto& qmd{kepler_compute->launch_description};
const GPUVAddr shader_addr{program_base + qmd.program_start};
const std::optional<VAddr> cpu_shader_addr{gpu_memory->GpuToCpuAddress(shader_addr)};
if (!cpu_shader_addr) {
LOG_ERROR(HW_GPU, "Invalid GPU address for shader 0x{:016x}", shader_addr);
return nullptr;
}
if (const ShaderInfo* const shader = TryGet(*cpu_shader_addr)) {
return shader;
}
ComputeEnvironment env{*kepler_compute, *gpu_memory, program_base, qmd.program_start};
return MakeShaderInfo(env, *cpu_shader_addr);
}
void ShaderCache::GetGraphicsEnvironments(GraphicsEnvironments& result,
const std::array<u64, NUM_PROGRAMS>& unique_hashes) {
impl->GetGraphicsEnvironments(result, unique_hashes);
size_t env_index{};
const GPUVAddr base_addr{maxwell3d->regs.program_region.Address()};
for (size_t index = 0; index < NUM_PROGRAMS; ++index) {
if (unique_hashes[index] == 0) {
continue;
}
const auto program{static_cast<Tegra::Engines::Maxwell3D::Regs::ShaderType>(index)};
auto& env{result.envs[index]};
const u32 start_address{maxwell3d->regs.pipelines[index].offset};
env = GraphicsEnvironment{*maxwell3d, *gpu_memory, program, base_addr, start_address};
env.SetCachedSize(shader_infos[index]->size_bytes);
result.env_ptrs[env_index++] = &env;
}
}
ShaderInfo* ShaderCache::TryGet(VAddr addr) const {
return impl->TryGet(addr);
std::scoped_lock lock{lookup_mutex};
const auto it = lookup_cache.find(addr);
if (it == lookup_cache.end()) {
return nullptr;
}
return it->second->data;
}
void ShaderCache::Register(std::unique_ptr<ShaderInfo> data, VAddr addr, size_t size) {
impl->Register(std::move(data), addr, size);
std::scoped_lock lock{invalidation_mutex, lookup_mutex};
const VAddr addr_end = addr + size;
Entry* const entry = NewEntry(addr, addr_end, data.get());
const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) {
invalidation_cache[page].push_back(entry);
}
storage.push_back(std::move(data));
device_memory.UpdatePagesCachedCount(addr, size, 1);
}
void ShaderCache::InvalidatePagesInRegion(VAddr addr, size_t size) {
const VAddr addr_end = addr + size;
const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) {
auto it = invalidation_cache.find(page);
if (it == invalidation_cache.end()) {
continue;
}
InvalidatePageEntries(it->second, addr, addr_end);
}
}
void ShaderCache::RemovePendingShaders() {
if (marked_for_removal.empty()) {
return;
}
// Remove duplicates
std::ranges::sort(marked_for_removal);
marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()),
marked_for_removal.end());
boost::container::small_vector<ShaderInfo*, 16> removed_shaders;
std::scoped_lock lock{lookup_mutex};
for (Entry* const entry : marked_for_removal) {
removed_shaders.push_back(entry->data);
const auto it = lookup_cache.find(entry->addr_start);
ASSERT(it != lookup_cache.end());
lookup_cache.erase(it);
}
marked_for_removal.clear();
if (!removed_shaders.empty()) {
RemoveShadersFromStorage(removed_shaders);
}
}
void ShaderCache::InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
size_t index = 0;
while (index < entries.size()) {
Entry* const entry = entries[index];
if (!entry->Overlaps(addr, addr_end)) {
++index;
continue;
}
UnmarkMemory(entry);
RemoveEntryFromInvalidationCache(entry);
marked_for_removal.push_back(entry);
}
}
void ShaderCache::RemoveEntryFromInvalidationCache(const Entry* entry) {
const u64 page_end = (entry->addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS;
for (u64 page = entry->addr_start >> SUYU_PAGEBITS; page < page_end; ++page) {
const auto entries_it = invalidation_cache.find(page);
ASSERT(entries_it != invalidation_cache.end());
std::vector<Entry*>& entries = entries_it->second;
const auto entry_it = std::ranges::find(entries, entry);
ASSERT(entry_it != entries.end());
entries.erase(entry_it);
}
}
void ShaderCache::UnmarkMemory(Entry* entry) {
if (!entry->is_memory_marked) {
return;
}
entry->is_memory_marked = false;
const VAddr addr = entry->addr_start;
const size_t size = entry->addr_end - addr;
device_memory.UpdatePagesCachedCount(addr, size, -1);
}
void ShaderCache::RemoveShadersFromStorage(std::span<ShaderInfo*> removed_shaders) {
// Remove them from the cache
std::erase_if(storage, [&removed_shaders](const std::unique_ptr<ShaderInfo>& shader) {
return std::ranges::find(removed_shaders, shader.get()) != removed_shaders.end();
});
}
ShaderCache::Entry* ShaderCache::NewEntry(VAddr addr, VAddr addr_end, ShaderInfo* data) {
auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data});
Entry* const entry_pointer = entry.get();
lookup_cache.emplace(addr, std::move(entry));
return entry_pointer;
}
const ShaderInfo* ShaderCache::MakeShaderInfo(GenericEnvironment& env, VAddr cpu_addr) {
auto info = std::make_unique<ShaderInfo>();
if (const std::optional<u64> cached_hash{env.Analyze()}) {
info->unique_hash = *cached_hash;
info->size_bytes = env.CachedSizeBytes();
} else {
// Slow path, not really hit on commercial games
// Build a control flow graph to get the real shader size
Shader::ObjectPool<Shader::Maxwell::Flow::Block> flow_block;
Shader::Maxwell::Flow::CFG cfg{env, flow_block, env.StartAddress()};
info->unique_hash = env.CalculateHash();
info->size_bytes = env.ReadSizeBytes();
}
const size_t size_bytes{info->size_bytes};
const ShaderInfo* const result{info.get()};
Register(std::move(info), cpu_addr, size_bytes);
return result;
}
} // namespace VideoCommon
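
The Register()/InvalidatePagesInRegion() logic above buckets each cached shader by the virtual pages it spans, so an invalidation only has to examine the entries on the touched pages. A worked sketch of that page arithmetic; the 4 KiB page size is an assumption for illustration, the real values come from SUYU_PAGEBITS/SUYU_PAGESIZE.

#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint64_t PAGEBITS = 12;  // assumed 4 KiB pages
    constexpr uint64_t PAGESIZE = uint64_t{1} << PAGEBITS;
    const uint64_t addr = 0x10000500;
    const uint64_t size = 0x2000;
    const uint64_t addr_end = addr + size;
    // Same rounding as the shader cache: first page rounded down, last page rounded up.
    const uint64_t page_end = (addr_end + PAGESIZE - 1) >> PAGEBITS;
    for (uint64_t page = addr >> PAGEBITS; page < page_end; ++page) {
        std::printf("bucket 0x%llx\n", static_cast<unsigned long long>(page));
    }
    // A 0x2000-byte shader starting at 0x10000500 straddles pages 0x10000..0x10002,
    // so it is linked into three invalidation buckets.
    return 0;
}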

View file

@ -13,7 +13,7 @@
template <>
struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::string_view> {
template <typename FormatContext>
auto format(VideoCore::Surface::PixelFormat format, FormatContext& ctx) {
auto format(VideoCore::Surface::PixelFormat format, FormatContext& ctx) const {
using VideoCore::Surface::PixelFormat;
const string_view name = [format] {
switch (format) {
@ -234,7 +234,7 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str
template <>
struct fmt::formatter<VideoCommon::ImageType> : fmt::formatter<fmt::string_view> {
template <typename FormatContext>
auto format(VideoCommon::ImageType type, FormatContext& ctx) {
auto format(VideoCommon::ImageType type, FormatContext& ctx) const {
const string_view name = [type] {
using VideoCommon::ImageType;
switch (type) {
@ -262,7 +262,7 @@ struct fmt::formatter<VideoCommon::Extent3D> {
}
template <typename FormatContext>
auto format(const VideoCommon::Extent3D& extent, FormatContext& ctx) {
auto format(const VideoCommon::Extent3D& extent, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{{{}, {}, {}}}", extent.width, extent.height,
extent.depth);
}