From 096366ead51345bcd170e31b6160b14aaf73e996 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 23 Nov 2021 03:29:00 +0100 Subject: [PATCH 01/10] Common: improve native clock. --- src/common/uint128.h | 5 +++++ src/common/x64/native_clock.cpp | 40 ++++++++++++++++----------------- src/common/x64/native_clock.h | 13 +++++------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/common/uint128.h b/src/common/uint128.h index f890ffec24..199d0f55e0 100644 --- a/src/common/uint128.h +++ b/src/common/uint128.h @@ -30,6 +30,10 @@ namespace Common { #else return _udiv128(r[1], r[0], d, &remainder); #endif +#else +#ifdef __SIZEOF_INT128__ + const auto product = static_cast(a) * static_cast(b); + return static_cast(product / d); #else const u64 diva = a / d; const u64 moda = a % d; @@ -37,6 +41,7 @@ namespace Common { const u64 modb = b % d; return diva * b + moda * divb + moda * modb / d; #endif +#endif } // This function multiplies 2 u64 values and produces a u128 value; diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 1b71945037..427a382cdf 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -5,7 +5,6 @@ #include #include -#include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" @@ -65,8 +64,10 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen u64 rtsc_frequency_) : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ rtsc_frequency_} { - time_point.inner.last_measure = FencedRDTSC(); - time_point.inner.accumulated_ticks = 0U; + TimePoint new_time_point{}; + new_time_point.last_measure = FencedRDTSC(); + new_time_point.accumulated_ticks = 0U; + time_point.store(new_time_point); ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency); us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency); ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency); @@ 
-76,34 +77,31 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen u64 NativeClock::GetRTSC() { TimePoint new_time_point{}; - TimePoint current_time_point{}; - - current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); + TimePoint current_time_point = time_point.load(std::memory_order_acquire); do { const u64 current_measure = FencedRDTSC(); - u64 diff = current_measure - current_time_point.inner.last_measure; + u64 diff = current_measure - current_time_point.last_measure; diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) - new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure - ? current_measure - : current_time_point.inner.last_measure; - new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; - } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, - current_time_point.pack, current_time_point.pack)); + new_time_point.last_measure = current_measure > current_time_point.last_measure + ? 
current_measure + : current_time_point.last_measure; + new_time_point.accumulated_ticks = current_time_point.accumulated_ticks + diff; + } while (!time_point.compare_exchange_weak( + current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return new_time_point.inner.accumulated_ticks & inaccuracy_mask; + return new_time_point.accumulated_ticks & inaccuracy_mask; } void NativeClock::Pause(bool is_paused) { if (!is_paused) { - TimePoint current_time_point{}; TimePoint new_time_point{}; - - current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); + TimePoint current_time_point = time_point.load(std::memory_order_acquire); do { - new_time_point.pack = current_time_point.pack; - new_time_point.inner.last_measure = FencedRDTSC(); - } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, - current_time_point.pack, current_time_point.pack)); + new_time_point = current_time_point; + new_time_point.last_measure = FencedRDTSC(); + } while (!time_point.compare_exchange_weak(current_time_point, new_time_point, + std::memory_order_release, + std::memory_order_acquire)); } } diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 30d2ba2e91..e57446cb99 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -3,6 +3,7 @@ #pragma once +#include #include "common/wall_clock.h" namespace Common { @@ -28,13 +29,9 @@ public: private: u64 GetRTSC(); - union alignas(16) TimePoint { - TimePoint() : pack{} {} - u128 pack{}; - struct Inner { - u64 last_measure{}; - u64 accumulated_ticks{}; - } inner; + struct alignas(16) TimePoint { + u64 last_measure{}; + u64 accumulated_ticks{}; }; /// value used to reduce the native clocks accuracy as some apss rely on @@ -42,7 +39,7 @@ private: /// be higher. 
static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); - TimePoint time_point; + std::atomic time_point; // factors u64 clock_rtsc_factor{}; u64 cpu_rtsc_factor{}; From 846c994cc9ff3b53d0d3fa3cb3b8fe0418c462c6 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 27 Nov 2021 16:26:48 +0100 Subject: [PATCH 02/10] Core: Reimplement Core Timing. --- src/core/core_timing.cpp | 130 +++++++++++++++++++++------------ src/core/core_timing.h | 21 +++--- src/tests/core/core_timing.cpp | 1 - 3 files changed, 95 insertions(+), 57 deletions(-) diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 29e7dba9b1..9185029290 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -7,6 +7,7 @@ #include #include "common/microprofile.h" +#include "common/thread.h" #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/hardware_properties.h" @@ -59,68 +60,96 @@ void CoreTiming::Initialize(std::function&& on_thread_init_) { const auto empty_timed_callback = [](std::uintptr_t, std::chrono::nanoseconds) {}; ev_lost = CreateEvent("_lost_event", empty_timed_callback); if (is_multicore) { - timer_thread = std::make_unique(ThreadEntry, std::ref(*this)); + const auto hardware_concurrency = std::thread::hardware_concurrency(); + worker_threads.emplace_back(ThreadEntry, std::ref(*this)); + if (hardware_concurrency > 8) { + worker_threads.emplace_back(ThreadEntry, std::ref(*this)); + } } } void CoreTiming::Shutdown() { - paused = true; + is_paused = true; shutting_down = true; - pause_event.Set(); - event.Set(); - if (timer_thread) { - timer_thread->join(); + { + std::unique_lock main_lock(event_mutex); + event_cv.notify_all(); + wait_pause_cv.notify_all(); } + for (auto& thread : worker_threads) { + thread.join(); + } + worker_threads.clear(); ClearPendingEvents(); - timer_thread.reset(); has_started = false; } -void CoreTiming::Pause(bool is_paused) { - paused = is_paused; - pause_event.Set(); -} - -void 
CoreTiming::SyncPause(bool is_paused) { - if (is_paused == paused && paused_set == paused) { +void CoreTiming::Pause(bool is_paused_) { + std::unique_lock main_lock(event_mutex); + if (is_paused_ == paused_state.load(std::memory_order_relaxed)) { return; } - Pause(is_paused); - if (timer_thread) { - if (!is_paused) { - pause_event.Set(); + if (is_multicore) { + is_paused = is_paused_; + event_cv.notify_all(); + if (!is_paused_) { + wait_pause_cv.notify_all(); + } + } + paused_state.store(is_paused_, std::memory_order_relaxed); +} + +void CoreTiming::SyncPause(bool is_paused_) { + std::unique_lock main_lock(event_mutex); + if (is_paused_ == paused_state.load(std::memory_order_relaxed)) { + return; + } + + if (is_multicore) { + is_paused = is_paused_; + event_cv.notify_all(); + if (!is_paused_) { + wait_pause_cv.notify_all(); + } + } + paused_state.store(is_paused_, std::memory_order_relaxed); + if (is_multicore) { + if (is_paused_) { + wait_signal_cv.wait(main_lock, [this] { return pause_count == worker_threads.size(); }); + } else { + wait_signal_cv.wait(main_lock, [this] { return pause_count == 0; }); } - event.Set(); - while (paused_set != is_paused) - ; } } bool CoreTiming::IsRunning() const { - return !paused_set; + return !paused_state.load(std::memory_order_acquire); } bool CoreTiming::HasPendingEvents() const { - return !(wait_set && event_queue.empty()); + std::unique_lock main_lock(event_mutex); + return !event_queue.empty(); } void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, const std::shared_ptr& event_type, std::uintptr_t user_data) { - { - std::scoped_lock scope{basic_lock}; - const u64 timeout = static_cast((GetGlobalTimeNs() + ns_into_future).count()); - event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type}); + std::unique_lock main_lock(event_mutex); + const u64 timeout = static_cast((GetGlobalTimeNs() + ns_into_future).count()); - std::push_heap(event_queue.begin(), event_queue.end(), 
std::greater<>()); + event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type}); + + std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + + if (is_multicore) { + event_cv.notify_one(); } - event.Set(); } void CoreTiming::UnscheduleEvent(const std::shared_ptr& event_type, std::uintptr_t user_data) { - std::scoped_lock scope{basic_lock}; + std::unique_lock main_lock(event_mutex); const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) { return e.type.lock().get() == event_type.get() && e.user_data == user_data; }); @@ -168,11 +197,12 @@ u64 CoreTiming::GetClockTicks() const { } void CoreTiming::ClearPendingEvents() { + std::unique_lock main_lock(event_mutex); event_queue.clear(); } void CoreTiming::RemoveEvent(const std::shared_ptr& event_type) { - std::scoped_lock lock{basic_lock}; + std::unique_lock main_lock(event_mutex); const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) { return e.type.lock().get() == event_type.get(); @@ -186,21 +216,21 @@ void CoreTiming::RemoveEvent(const std::shared_ptr& event_type) { } std::optional CoreTiming::Advance() { - std::scoped_lock lock{advance_lock, basic_lock}; global_timer = GetGlobalTimeNs().count(); + std::unique_lock main_lock(event_mutex); while (!event_queue.empty() && event_queue.front().time <= global_timer) { Event evt = std::move(event_queue.front()); std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>()); event_queue.pop_back(); - basic_lock.unlock(); + event_mutex.unlock(); if (const auto event_type{evt.type.lock()}) { - event_type->callback( - evt.user_data, std::chrono::nanoseconds{static_cast(global_timer - evt.time)}); + event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast( + GetGlobalTimeNs().count() - evt.time)}); } - basic_lock.lock(); + event_mutex.lock(); global_timer = GetGlobalTimeNs().count(); } @@ -213,26 +243,34 @@ std::optional 
CoreTiming::Advance() { } void CoreTiming::ThreadLoop() { + const auto predicate = [this] { return !event_queue.empty() || is_paused; }; has_started = true; while (!shutting_down) { - while (!paused) { - paused_set = false; + while (!is_paused && !shutting_down) { const auto next_time = Advance(); if (next_time) { if (*next_time > 0) { std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time); - event.WaitFor(next_time_ns); + std::unique_lock main_lock(event_mutex); + event_cv.wait_for(main_lock, next_time_ns, predicate); } } else { - wait_set = true; - event.Wait(); + std::unique_lock main_lock(event_mutex); + event_cv.wait(main_lock, predicate); } - wait_set = false; } - paused_set = true; - clock->Pause(true); - pause_event.Wait(); - clock->Pause(false); + std::unique_lock main_lock(event_mutex); + pause_count++; + if (pause_count == worker_threads.size()) { + clock->Pause(true); + wait_signal_cv.notify_all(); + } + wait_pause_cv.wait(main_lock, [this] { return !is_paused || shutting_down; }); + pause_count--; + if (pause_count == 0) { + clock->Pause(false); + wait_signal_cv.notify_all(); + } } } diff --git a/src/core/core_timing.h b/src/core/core_timing.h index d277730096..5c9ee29029 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -14,7 +14,6 @@ #include #include "common/common_types.h" -#include "common/thread.h" #include "common/wall_clock.h" namespace Core::Timing { @@ -146,19 +145,21 @@ private: u64 event_fifo_id = 0; std::shared_ptr ev_lost; - Common::Event event{}; - Common::Event pause_event{}; - std::mutex basic_lock; - std::mutex advance_lock; - std::unique_ptr timer_thread; - std::atomic paused{}; - std::atomic paused_set{}; - std::atomic wait_set{}; - std::atomic shutting_down{}; std::atomic has_started{}; std::function on_thread_init{}; + std::vector worker_threads; + + std::condition_variable event_cv; + std::condition_variable wait_pause_cv; + std::condition_variable wait_signal_cv; + mutable std::mutex 
event_mutex; + + std::atomic paused_state{}; + bool is_paused{}; + bool shutting_down{}; bool is_multicore{}; + size_t pause_count{}; /// Cycle timing u64 ticks{}; diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp index 8358d36b50..62eb437538 100644 --- a/src/tests/core/core_timing.cpp +++ b/src/tests/core/core_timing.cpp @@ -27,7 +27,6 @@ void HostCallbackTemplate(std::uintptr_t user_data, std::chrono::nanoseconds ns_ static_assert(IDX < CB_IDS.size(), "IDX out of range"); callbacks_ran_flags.set(IDX); REQUIRE(CB_IDS[IDX] == user_data); - REQUIRE(CB_IDS[IDX] == CB_IDS[calls_order[expected_callback]]); delays[IDX] = ns_late.count(); ++expected_callback; } From a2d29412cbda3e0dc57c49c5d4c098e8ba73cbb5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 27 Nov 2021 20:31:46 +0100 Subject: [PATCH 03/10] Core/Common: Corrections to core timing and add critical priority. --- src/common/thread.cpp | 13 +++++++++---- src/common/thread.h | 1 + src/core/core_timing.cpp | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/common/thread.cpp b/src/common/thread.cpp index f932a72909..924f0df1b3 100644 --- a/src/common/thread.cpp +++ b/src/common/thread.cpp @@ -47,6 +47,9 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) { case ThreadPriority::VeryHigh: windows_priority = THREAD_PRIORITY_HIGHEST; break; + case ThreadPriority::Critical: + windows_priority = THREAD_PRIORITY_TIME_CRITICAL; + break; default: windows_priority = THREAD_PRIORITY_NORMAL; break; @@ -59,9 +62,11 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) { void SetCurrentThreadPriority(ThreadPriority new_priority) { pthread_t this_thread = pthread_self(); - s32 max_prio = sched_get_priority_max(SCHED_OTHER); - s32 min_prio = sched_get_priority_min(SCHED_OTHER); - u32 level = static_cast(new_priority) + 1; + const auto scheduling_type = + new_priority != ThreadPriority::Critical ? 
SCHED_OTHER : SCHED_FIFO; + s32 max_prio = sched_get_priority_max(scheduling_type); + s32 min_prio = sched_get_priority_min(scheduling_type); + u32 level = std::max(static_cast(new_priority) + 1, 4U); struct sched_param params; if (max_prio > min_prio) { @@ -70,7 +75,7 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) { params.sched_priority = min_prio - ((min_prio - max_prio) * level) / 4; } - pthread_setschedparam(this_thread, SCHED_OTHER, &params); + pthread_setschedparam(this_thread, scheduling_type, &params); } #endif diff --git a/src/common/thread.h b/src/common/thread.h index a631225162..1552f58e0f 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -92,6 +92,7 @@ enum class ThreadPriority : u32 { Normal = 1, High = 2, VeryHigh = 3, + Critical = 4, }; void SetCurrentThreadPriority(ThreadPriority new_priority); diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 9185029290..b6c295ada6 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -46,7 +46,7 @@ void CoreTiming::ThreadEntry(CoreTiming& instance) { constexpr char name[] = "yuzu:HostTiming"; MicroProfileOnThreadCreate(name); Common::SetCurrentThreadName(name); - Common::SetCurrentThreadPriority(Common::ThreadPriority::VeryHigh); + Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical); instance.on_thread_init(); instance.ThreadLoop(); MicroProfileOnThreadExit(); From 00b09de3d9578b29271b33df1b98a37449e7373f Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 28 Nov 2021 11:28:29 +0100 Subject: [PATCH 04/10] Core: add missing include. 
--- src/core/core_timing.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/core_timing.h b/src/core/core_timing.h index 5c9ee29029..901bf532ed 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include From 9cafb0d91266210dab2c72e484b493bceae1cb02 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 28 Nov 2021 12:21:45 +0100 Subject: [PATCH 05/10] Core: Fix tests. --- src/common/thread.cpp | 3 +-- src/common/x64/native_clock.cpp | 1 + src/tests/core/core_timing.cpp | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/common/thread.cpp b/src/common/thread.cpp index 924f0df1b3..919e33af92 100644 --- a/src/common/thread.cpp +++ b/src/common/thread.cpp @@ -62,8 +62,7 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) { void SetCurrentThreadPriority(ThreadPriority new_priority) { pthread_t this_thread = pthread_self(); - const auto scheduling_type = - new_priority != ThreadPriority::Critical ? 
SCHED_OTHER : SCHED_FIFO; + const auto scheduling_type = SCHED_OTHER; s32 max_prio = sched_get_priority_max(scheduling_type); s32 min_prio = sched_get_priority_min(scheduling_type); u32 level = std::max(static_cast(new_priority) + 1, 4U); diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 427a382cdf..0b89f9ed2e 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -5,6 +5,7 @@ #include #include +#include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp index 62eb437538..e687416a81 100644 --- a/src/tests/core/core_timing.cpp +++ b/src/tests/core/core_timing.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "core/core.h" @@ -21,9 +22,11 @@ std::array delays{}; std::bitset callbacks_ran_flags; u64 expected_callback = 0; +std::mutex control_mutex; template void HostCallbackTemplate(std::uintptr_t user_data, std::chrono::nanoseconds ns_late) { + std::unique_lock lk(control_mutex); static_assert(IDX < CB_IDS.size(), "IDX out of range"); callbacks_ran_flags.set(IDX); REQUIRE(CB_IDS[IDX] == user_data); From 38e4a144a1e6f399482eb586c1e0d5646fae9679 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 28 Nov 2021 13:47:40 +0100 Subject: [PATCH 06/10] Core: Protect each event from race conditions within it. 
--- src/core/core_timing.cpp | 1 + src/core/core_timing.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index b6c295ada6..18dfa07f51 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -226,6 +226,7 @@ std::optional CoreTiming::Advance() { event_mutex.unlock(); if (const auto event_type{evt.type.lock()}) { + std::unique_lock lk(event_type->guard); event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast( GetGlobalTimeNs().count() - evt.time)}); } diff --git a/src/core/core_timing.h b/src/core/core_timing.h index 901bf532ed..4fef6fcce1 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -32,6 +32,7 @@ struct EventType { TimedCallback callback; /// A pointer to the name of the event. const std::string name; + mutable std::mutex guard; }; /** From 86ccce3721a02338865be74e145255c8a4cb6b4e Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 28 Jun 2022 01:19:30 +0200 Subject: [PATCH 07/10] Address feedback. 
--- src/core/core_timing.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 18dfa07f51..ac117161c0 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -72,7 +72,7 @@ void CoreTiming::Shutdown() { is_paused = true; shutting_down = true; { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); event_cv.notify_all(); wait_pause_cv.notify_all(); } @@ -85,7 +85,7 @@ void CoreTiming::Shutdown() { } void CoreTiming::Pause(bool is_paused_) { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); if (is_paused_ == paused_state.load(std::memory_order_relaxed)) { return; } @@ -100,7 +100,7 @@ void CoreTiming::Pause(bool is_paused_) { } void CoreTiming::SyncPause(bool is_paused_) { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); if (is_paused_ == paused_state.load(std::memory_order_relaxed)) { return; } @@ -127,7 +127,7 @@ bool CoreTiming::IsRunning() const { } bool CoreTiming::HasPendingEvents() const { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); return !event_queue.empty(); } @@ -135,7 +135,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, const std::shared_ptr& event_type, std::uintptr_t user_data) { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); const u64 timeout = static_cast((GetGlobalTimeNs() + ns_into_future).count()); event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type}); @@ -149,7 +149,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, void CoreTiming::UnscheduleEvent(const std::shared_ptr& event_type, std::uintptr_t user_data) { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), 
[&](const Event& e) { return e.type.lock().get() == event_type.get() && e.user_data == user_data; }); @@ -197,12 +197,12 @@ u64 CoreTiming::GetClockTicks() const { } void CoreTiming::ClearPendingEvents() { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); event_queue.clear(); } void CoreTiming::RemoveEvent(const std::shared_ptr& event_type) { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) { return e.type.lock().get() == event_type.get(); @@ -218,7 +218,7 @@ void CoreTiming::RemoveEvent(const std::shared_ptr& event_type) { std::optional CoreTiming::Advance() { global_timer = GetGlobalTimeNs().count(); - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); while (!event_queue.empty() && event_queue.front().time <= global_timer) { Event evt = std::move(event_queue.front()); std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>()); @@ -226,7 +226,7 @@ std::optional CoreTiming::Advance() { event_mutex.unlock(); if (const auto event_type{evt.type.lock()}) { - std::unique_lock lk(event_type->guard); + std::unique_lock lk(event_type->guard); event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast( GetGlobalTimeNs().count() - evt.time)}); } @@ -252,15 +252,15 @@ void CoreTiming::ThreadLoop() { if (next_time) { if (*next_time > 0) { std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time); - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); event_cv.wait_for(main_lock, next_time_ns, predicate); } } else { - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); event_cv.wait(main_lock, predicate); } } - std::unique_lock main_lock(event_mutex); + std::unique_lock main_lock(event_mutex); pause_count++; if (pause_count == worker_threads.size()) { clock->Pause(true); From 
f5c1d7b8c8895b5d6b99685313be9061c8ed8a82 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 28 Jun 2022 01:47:00 +0200 Subject: [PATCH 08/10] Native Clock: remove inaccuracy mask. --- src/common/x64/native_clock.cpp | 2 +- src/common/x64/native_clock.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 0b89f9ed2e..488c8c905c 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -90,7 +90,7 @@ u64 NativeClock::GetRTSC() { } while (!time_point.compare_exchange_weak( current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return new_time_point.accumulated_ticks & inaccuracy_mask; + return new_time_point.accumulated_ticks; } void NativeClock::Pause(bool is_paused) { diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index e57446cb99..046cea0952 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -34,11 +34,6 @@ private: u64 accumulated_ticks{}; }; - /// value used to reduce the native clocks accuracy as some apss rely on - /// undefined behavior where the level of accuracy in the clock shouldn't - /// be higher. - static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); - std::atomic time_point; // factors u64 clock_rtsc_factor{}; From 2575a93dc6d15bb4c60c18be1635b48f37355059 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 28 Jun 2022 22:42:00 +0200 Subject: [PATCH 09/10] Native clock: Use atomic ops as before. 
--- src/common/x64/native_clock.cpp | 39 +++++++++++++++++---------------- src/common/x64/native_clock.h | 14 +++++++----- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 488c8c905c..c0d38cf6be 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -65,10 +65,8 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen u64 rtsc_frequency_) : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ rtsc_frequency_} { - TimePoint new_time_point{}; - new_time_point.last_measure = FencedRDTSC(); - new_time_point.accumulated_ticks = 0U; - time_point.store(new_time_point); + time_point.inner.last_measure = FencedRDTSC(); + time_point.inner.accumulated_ticks = 0U; ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency); us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency); ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency); @@ -77,32 +75,35 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen } u64 NativeClock::GetRTSC() { + TimePoint current_time_point{}; TimePoint new_time_point{}; - TimePoint current_time_point = time_point.load(std::memory_order_acquire); + + current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); do { const u64 current_measure = FencedRDTSC(); - u64 diff = current_measure - current_time_point.last_measure; + u64 diff = current_measure - current_time_point.inner.last_measure; diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) - new_time_point.last_measure = current_measure > current_time_point.last_measure - ? 
current_measure - : current_time_point.last_measure; - new_time_point.accumulated_ticks = current_time_point.accumulated_ticks + diff; - } while (!time_point.compare_exchange_weak( - current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire)); + new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure + ? current_measure + : current_time_point.inner.last_measure; + new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack, current_time_point.pack)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return new_time_point.accumulated_ticks; + return new_time_point.inner.accumulated_ticks; } void NativeClock::Pause(bool is_paused) { if (!is_paused) { + TimePoint current_time_point{}; TimePoint new_time_point{}; - TimePoint current_time_point = time_point.load(std::memory_order_acquire); + + current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); do { - new_time_point = current_time_point; - new_time_point.last_measure = FencedRDTSC(); - } while (!time_point.compare_exchange_weak(current_time_point, new_time_point, - std::memory_order_release, - std::memory_order_acquire)); + new_time_point.pack = current_time_point.pack; + new_time_point.inner.last_measure = FencedRDTSC(); + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack, current_time_point.pack)); } } diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 046cea0952..38ae7a4625 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -3,7 +3,6 @@ #pragma once -#include #include "common/wall_clock.h" namespace Common { @@ -29,12 +28,17 @@ public: private: u64 GetRTSC(); - struct alignas(16) TimePoint { - u64 last_measure{}; - u64 
accumulated_ticks{}; + union alignas(16) TimePoint { + TimePoint() : pack{} {} + u128 pack{}; + struct Inner { + u64 last_measure{}; + u64 accumulated_ticks{}; + } inner; }; - std::atomic time_point; + TimePoint time_point; + // factors u64 clock_rtsc_factor{}; u64 cpu_rtsc_factor{}; From 3196d957b02266293b68a60c75c3db9a00faf1f6 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 29 Jun 2022 01:29:24 +0200 Subject: [PATCH 10/10] Address Feedback. --- src/common/x64/native_clock.cpp | 1 - src/core/core_timing.cpp | 43 ++++++++++++++++++++------------- src/core/core_timing.h | 4 ++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index c0d38cf6be..6aaa8cdf99 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -89,7 +89,6 @@ u64 NativeClock::GetRTSC() { new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, current_time_point.pack, current_time_point.pack)); - /// The clock cannot be more precise than the guest timer, remove the lower bits return new_time_point.inner.accumulated_ticks; } diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index ac117161c0..1405780695 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -6,6 +6,7 @@ #include #include +#include "common/logging/log.h" #include "common/microprofile.h" #include "common/thread.h" #include "core/core_timing.h" @@ -42,10 +43,10 @@ CoreTiming::CoreTiming() CoreTiming::~CoreTiming() = default; -void CoreTiming::ThreadEntry(CoreTiming& instance) { - constexpr char name[] = "yuzu:HostTiming"; - MicroProfileOnThreadCreate(name); - Common::SetCurrentThreadName(name); +void CoreTiming::ThreadEntry(CoreTiming& instance, size_t id) { + const std::string name = "yuzu:HostTiming_" + std::to_string(id); + MicroProfileOnThreadCreate(name.c_str()); + 
Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical); instance.on_thread_init(); instance.ThreadLoop(); @@ -61,9 +62,10 @@ void CoreTiming::Initialize(std::function&& on_thread_init_) { ev_lost = CreateEvent("_lost_event", empty_timed_callback); if (is_multicore) { const auto hardware_concurrency = std::thread::hardware_concurrency(); - worker_threads.emplace_back(ThreadEntry, std::ref(*this)); + size_t id = 0; + worker_threads.emplace_back(ThreadEntry, std::ref(*this), id++); if (hardware_concurrency > 8) { - worker_threads.emplace_back(ThreadEntry, std::ref(*this)); + worker_threads.emplace_back(ThreadEntry, std::ref(*this), id++); } } } @@ -71,11 +73,10 @@ void CoreTiming::Initialize(std::function&& on_thread_init_) { void CoreTiming::Shutdown() { is_paused = true; shutting_down = true; - { - std::unique_lock main_lock(event_mutex); - event_cv.notify_all(); - wait_pause_cv.notify_all(); - } + std::atomic_thread_fence(std::memory_order_release); + + event_cv.notify_all(); + wait_pause_cv.notify_all(); for (auto& thread : worker_threads) { thread.join(); } @@ -128,7 +129,7 @@ bool CoreTiming::IsRunning() const { bool CoreTiming::HasPendingEvents() const { std::unique_lock main_lock(event_mutex); - return !event_queue.empty(); + return !event_queue.empty() || pending_events.load(std::memory_order_relaxed) != 0; } void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, @@ -139,6 +140,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, const u64 timeout = static_cast((GetGlobalTimeNs() + ns_into_future).count()); event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type}); + pending_events.fetch_add(1, std::memory_order_relaxed); std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>()); @@ -158,6 +160,7 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr& event_type, if (itr != event_queue.end()) { event_queue.erase(itr, 
event_queue.end()); std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + pending_events.fetch_sub(1, std::memory_order_relaxed); } } @@ -223,15 +226,21 @@ std::optional CoreTiming::Advance() { Event evt = std::move(event_queue.front()); std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>()); event_queue.pop_back(); - event_mutex.unlock(); if (const auto event_type{evt.type.lock()}) { - std::unique_lock lk(event_type->guard); - event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast( - GetGlobalTimeNs().count() - evt.time)}); + sequence_mutex.lock(); + event_mutex.unlock(); + + event_type->guard.lock(); + sequence_mutex.unlock(); + const s64 delay = static_cast(GetGlobalTimeNs().count() - evt.time); + event_type->callback(evt.user_data, std::chrono::nanoseconds{delay}); + event_type->guard.unlock(); + + event_mutex.lock(); + pending_events.fetch_sub(1, std::memory_order_relaxed); } - event_mutex.lock(); global_timer = GetGlobalTimeNs().count(); } diff --git a/src/core/core_timing.h b/src/core/core_timing.h index 4fef6fcce1..a86553e08b 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -132,7 +132,7 @@ private: /// Clear all pending events. This should ONLY be done on exit. void ClearPendingEvents(); - static void ThreadEntry(CoreTiming& instance); + static void ThreadEntry(CoreTiming& instance, size_t id); void ThreadLoop(); std::unique_ptr clock; @@ -145,6 +145,7 @@ private: // accomodated by the standard adaptor class. std::vector event_queue; u64 event_fifo_id = 0; + std::atomic pending_events{}; std::shared_ptr ev_lost; std::atomic has_started{}; @@ -156,6 +157,7 @@ private: std::condition_variable wait_pause_cv; std::condition_variable wait_signal_cv; mutable std::mutex event_mutex; + mutable std::mutex sequence_mutex; std::atomic paused_state{}; bool is_paused{};