Query Cache: address issues

This commit is contained in:
Fernando Sahmkow 2023-08-06 09:38:16 +02:00
parent aa6587d854
commit 282ae8fa51
21 changed files with 270 additions and 214 deletions

View file

@ -276,9 +276,8 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
}
template <class P>
std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer(VAddr cpu_addr, u32 size,
ObtainBufferSynchronize sync_info,
ObtainBufferOperation post_op) {
std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer(
VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) {
const BufferId buffer_id = FindBuffer(cpu_addr, size);
Buffer& buffer = slot_buffers[buffer_id];

View file

@ -596,12 +596,6 @@ void Maxwell3D::ProcessCounterReset() {
case Regs::ClearReport::ZPassPixelCount:
rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64);
break;
case Regs::ClearReport::PrimitivesGenerated:
rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount);
break;
case Regs::ClearReport::VtgPrimitivesOut:
rasterizer->ResetCounter(VideoCommon::QueryType::StreamingByteCount);
break;
default:
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value);
break;

View file

@ -82,7 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
if (op == GpuSemaphoreOperation::WriteLong) {
const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
const u32 payload = regs.semaphore_sequence;
rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
} else {
do {
const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())};
@ -117,7 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
void Puller::ProcessSemaphoreRelease() {
const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
const u32 payload = regs.semaphore_release;
rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0);
rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0);
}
void Puller::ProcessSemaphoreAcquire() {

View file

@ -55,6 +55,9 @@ public:
// Orders previously submitted work without calling SignalFence.
// When fences cannot be checked asynchronously, pending fences first get a
// non-forced chance to be released; afterwards the buffer cache's flushes are
// accumulated while holding the buffer cache mutex.
// NOTE(review): the original comment was truncated ("Unlike other fences, this
// one doesn't"); presumably it meant that no fence object is queued here - confirm.
void SignalOrdering() {
if constexpr (!can_async_check) {
// Non-forced release (template argument false), done before taking the cache mutex.
TryReleasePendingFences<false>();
}
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.AccumulateFlushes();
}
@ -104,13 +107,9 @@ public:
SignalFence(std::move(func));
}
void WaitPendingFences(bool force) {
void WaitPendingFences([[maybe_unused]] bool force) {
if constexpr (!can_async_check) {
if (force) {
TryReleasePendingFences<true>();
} else {
TryReleasePendingFences<false>();
}
} else {
if (!force) {
return;
@ -125,7 +124,8 @@ public:
});
SignalFence(std::move(func));
std::unique_lock lk(wait_mutex);
wait_cv.wait(lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); });
wait_cv.wait(
lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); });
}
}

View file

@ -7,21 +7,19 @@
#include <deque>
#include <utility>
#include "common/common_types.h"
namespace VideoCommon {
class BankBase {
protected:
const size_t base_bank_size;
size_t bank_size;
std::atomic<size_t> references;
size_t current_slot;
const size_t base_bank_size{};
size_t bank_size{};
std::atomic<size_t> references{};
size_t current_slot{};
public:
BankBase(size_t bank_size_)
: base_bank_size{bank_size_}, bank_size(bank_size_), references(0), current_slot(0) {}
explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {}
virtual ~BankBase() = default;
@ -58,11 +56,11 @@ public:
bank_size = current_slot;
}
constexpr bool IsClosed() {
bool IsClosed() const {
return current_slot >= bank_size;
}
bool IsDead() {
bool IsDead() const {
return IsClosed() && references == 0;
}
};

View file

@ -9,7 +9,7 @@
namespace VideoCommon {
enum class QueryFlagBits : u32 {
HasTimestamp = 1 << 0, ///< Indicates if this query has a tiemstamp.
HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp.
IsFinalValueSynced = 1 << 1, ///< Indicates if the final value of the query has been synced in the host
IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host
IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest.
@ -24,13 +24,13 @@ DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits)
class QueryBase {
public:
VAddr guest_address;
QueryFlagBits flags;
u64 value;
VAddr guest_address{};
QueryFlagBits flags{};
u64 value{};
protected:
// Default constructor
QueryBase() : guest_address(0), flags{}, value{} {}
QueryBase() = default;
// Parameterized constructor
QueryBase(VAddr address, QueryFlagBits flags_, u64 value_)
@ -51,23 +51,21 @@ public:
class HostQueryBase : public QueryBase {
public:
// Default constructor
HostQueryBase()
: QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0), start_bank_id{},
size_banks{}, start_slot{}, size_slots{} {}
HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {}
// Parameterized constructor
HostQueryBase(bool isLong, VAddr address)
HostQueryBase(bool has_timestamp, VAddr address)
: QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{},
start_slot{}, size_slots{} {
if (isLong) {
if (has_timestamp) {
flags |= QueryFlagBits::HasTimestamp;
}
}
u32 start_bank_id;
u32 size_banks;
size_t start_slot;
size_t size_slots;
u32 start_bank_id{};
u32 size_banks{};
size_t start_slot{};
size_t size_slots{};
};
} // namespace VideoCommon

View file

@ -54,7 +54,7 @@ public:
return new_id;
}
bool HasPendingSync() override {
bool HasPendingSync() const override {
return !pending_sync.empty();
}
@ -71,8 +71,10 @@ public:
continue;
}
query.flags |= QueryFlagBits::IsHostSynced;
sync_values.emplace_back(query.guest_address, query.value,
True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4);
sync_values.emplace_back(SyncValuesStruct{
.address = query.guest_address,
.value = query.value,
.size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)});
}
pending_sync.clear();
if (sync_values.size() > 0) {
@ -90,15 +92,20 @@ class StubStreamer : public GuestStreamer<Traits> {
public:
using RuntimeType = typename Traits::RuntimeType;
StubStreamer(size_t id_, RuntimeType& runtime_) : GuestStreamer<Traits>(id_, runtime_) {}
StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_)
: GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {}
~StubStreamer() override = default;
size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value,
std::optional<u32> subreport = std::nullopt) override {
size_t new_id = GuestStreamer<Traits>::WriteCounter(address, has_timestamp, 1U, subreport);
size_t new_id =
GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport);
return new_id;
}
private:
u32 stub_value;
};
template <typename Traits>
@ -113,7 +120,7 @@ struct QueryCacheBase<Traits>::QueryCacheBaseImpl {
for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) {
streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i));
if (streamers[i]) {
streamer_mask |= 1ULL << i;
streamer_mask |= 1ULL << streamers[i]->GetId();
}
}
}
@ -152,7 +159,7 @@ struct QueryCacheBase<Traits>::QueryCacheBaseImpl {
QueryCacheBase<Traits>* owner;
VideoCore::RasterizerInterface& rasterizer;
Core::Memory::Memory& cpu_memory;
Traits::RuntimeType& runtime;
RuntimeType& runtime;
Tegra::GPU& gpu;
std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers;
u64 streamer_mask;
@ -223,15 +230,11 @@ void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type
const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence);
size_t streamer_id = static_cast<size_t>(counter_type);
auto* streamer = impl->streamers[streamer_id];
if (!streamer) [[unlikely]] {
if (has_timestamp) {
u64 timestamp = impl->gpu.GetTicks();
gpu_memory->Write<u64>(addr + 8, timestamp);
gpu_memory->Write<u64>(addr, 1ULL);
} else {
gpu_memory->Write<u32>(addr, 1U);
}
return;
if (streamer == nullptr) [[unlikely]] {
counter_type = QueryType::Payload;
payload = 1U;
streamer_id = static_cast<size_t>(counter_type);
streamer = impl->streamers[streamer_id];
}
auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr);
if (!cpu_addr_opt) [[unlikely]] {
@ -403,12 +406,6 @@ bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() {
impl->runtime.EndHostConditionalRendering();
return false;
}
/*if (!Settings::IsGPULevelHigh()) {
impl->runtime.EndHostConditionalRendering();
return gpu_memory->IsMemoryDirty(regs.render_enable.Address(), 24,
VideoCommon::CacheType::BufferCache |
VideoCommon::CacheType::QueryCache);
}*/
const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode);
const GPUVAddr address = regs.render_enable.Address();
switch (mode) {
@ -442,6 +439,9 @@ bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() {
// Async downloads
template <typename Traits>
void QueryCacheBase<Traits>::CommitAsyncFlushes() {
// Make sure to have the results synced in Host.
NotifyWFI();
u64 mask{};
{
std::scoped_lock lk(impl->flush_guard);
@ -458,8 +458,19 @@ void QueryCacheBase<Traits>::CommitAsyncFlushes() {
if (mask == 0) {
return;
}
impl->ForEachStreamerIn(mask,
[](StreamerInterface* streamer) { streamer->PushUnsyncedQueries(); });
// Push unsynced queries streamer by streamer, honouring the masks reported by
// GetDependentMask(). Streamers that are not part of this flush are treated as
// already run, hence ran_mask starts as the complement of mask.
u64 ran_mask = ~mask;
// Each pass runs every streamer whose reported mask is fully covered by
// ran_mask; the rest are retried on the next pass.
// NOTE(review): this terminates only if the masks contain no cycle among the
// streamers in `mask` - confirm that MakeDependent is never used to build one.
while (mask) {
impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
u64 dep_mask = streamer->GetDependentMask();
// Some streamer this one is ordered after has not run yet: defer it.
if ((dep_mask & ~ran_mask) != 0) {
return;
}
u64 index = streamer->GetId();
// Mark as run and drop it from the pending set before pushing.
ran_mask |= (1ULL << index);
mask &= ~(1ULL << index);
streamer->PushUnsyncedQueries();
});
}
}
template <typename Traits>
@ -489,13 +500,11 @@ void QueryCacheBase<Traits>::PopAsyncFlushes() {
if (mask == 0) {
return;
}
u64 ran_mask = 0;
u64 next_phase = 0;
u64 ran_mask = ~mask;
while (mask) {
impl->ForEachStreamerIn(mask, [&mask, &ran_mask, &next_phase](StreamerInterface* streamer) {
impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
u64 dep_mask = streamer->GetDependenceMask();
if ((dep_mask & ~ran_mask) != 0) {
next_phase |= dep_mask;
return;
}
u64 index = streamer->GetId();
@ -503,7 +512,6 @@ void QueryCacheBase<Traits>::PopAsyncFlushes() {
mask &= ~(1ULL << index);
streamer->PopUnsyncedQueries();
});
ran_mask |= next_phase;
}
}

View file

@ -47,7 +47,7 @@ public:
BitField<0, 27, u32> query_id;
u32 raw;
std::pair<size_t, size_t> unpack() {
std::pair<size_t, size_t> unpack() const {
return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())};
}
};
@ -73,7 +73,7 @@ public:
}
}
static u64 BuildMask(std::span<QueryType> types) {
static u64 BuildMask(std::span<const QueryType> types) {
u64 mask = 0;
for (auto query_type : types) {
mask |= 1ULL << (static_cast<u64>(query_type));
@ -160,7 +160,7 @@ protected:
}
}
using ContentCache = typename std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>;
using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>;
void InvalidateQuery(QueryLocation location);
bool IsQueryDirty(QueryLocation location);
@ -175,7 +175,7 @@ protected:
friend struct QueryCacheBaseImpl;
friend RuntimeType;
std::unique_ptr<struct QueryCacheBaseImpl> impl;
std::unique_ptr<QueryCacheBaseImpl> impl;
};
} // namespace VideoCommon

View file

@ -16,7 +16,7 @@ namespace VideoCommon {
class StreamerInterface {
public:
StreamerInterface(size_t id_, u64 dependance_mask_ = 0) : id{id_}, dependance_mask{dependance_mask_} {}
explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {}
virtual ~StreamerInterface() = default;
virtual QueryBase* GetQuery(size_t id) = 0;
@ -37,7 +37,7 @@ public:
/* Do Nothing */
}
virtual bool HasPendingSync() {
virtual bool HasPendingSync() const {
return false;
}
@ -52,7 +52,7 @@ public:
virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
std::optional<u32> subreport = std::nullopt) = 0;
virtual bool HasUnsyncedQueries() {
virtual bool HasUnsyncedQueries() const {
return false;
}
@ -71,18 +71,28 @@ public:
}
u64 GetDependenceMask() const {
return dependance_mask;
return dependence_mask;
}
/// Returns the bit mask of streamer ids that depend on this streamer, i.e. the
/// bits recorded in `dependent_mask` by MakeDependent() on the dependency side.
/// This is the reverse edge of GetDependenceMask(); it previously returned
/// `dependence_mask`, which made both accessors report the same value and left
/// `dependent_mask` unread.
u64 GetDependentMask() const {
    return dependent_mask;
}
protected:
/// Links this streamer to `depend_on` in both directions.
/// @param depend_on Streamer this one depends on; must not be null, as it is
///                  dereferenced unconditionally.
/// Sets this streamer's id bit in the provider's `dependent_mask` and the
/// provider's id bit in this streamer's `dependence_mask`.
void MakeDependent(StreamerInterface* depend_on) {
    StreamerInterface& provider = *depend_on;
    // Reverse edge: the provider learns that this streamer depends on it.
    provider.dependent_mask |= 1ULL << id;
    // Forward edge: this streamer records what it depends on.
    dependence_mask |= 1ULL << provider.id;
}
const size_t id;
const u64 dependance_mask;
u64 dependence_mask;
u64 dependent_mask;
};
template <typename QueryType>
class SimpleStreamer : public StreamerInterface {
public:
SimpleStreamer(size_t id_, u64 dependance_mask_ = 0) : StreamerInterface{id_, dependance_mask_} {}
explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {}
virtual ~SimpleStreamer() = default;
protected:

View file

@ -9,10 +9,10 @@
#include <utility>
#include "common/common_types.h"
#include "common/polyfill_thread.h"
#include "video_core/query_cache/types.h"
#include "video_core/cache_types.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h"
#include "video_core/query_cache/types.h"
#include "video_core/rasterizer_download_area.h"
namespace Tegra {
@ -57,7 +57,8 @@ public:
virtual void ResetCounter(VideoCommon::QueryType type) = 0;
/// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0;
virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0;
/// Signal an uniform buffer binding
virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,

View file

@ -43,7 +43,8 @@ public:
void Clear(u32 layer_count) override;
void DispatchCompute() override;
void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override;

View file

@ -405,8 +405,6 @@ void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) {
void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
if (type == VideoCommon::QueryType::ZPassPixelCount64) {
std::optional<u64> timestamp{True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)
? std::make_optional<u64>(gpu.GetTicks()) : std:: nullopt };
if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()});
} else {
@ -414,13 +412,23 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
}
return;
}
if (type != VideoCommon::QueryType::Payload) {
payload = 1u;
}
std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() {
if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
u64 ticks = gpu.GetTicks();
gpu_memory->Write<u64>(gpu_addr + 8, ticks);
gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload));
memory_manager->Write<u64>(gpu_addr + 8, ticks);
memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload));
} else {
gpu_memory->Write<u32>(gpu_addr, payload);
memory_manager->Write<u32>(gpu_addr, payload);
}
});
if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) {
SignalFence(std::move(func));
return;
}
func();
}
void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,

View file

@ -87,7 +87,8 @@ public:
void Clear(u32 layer_count) override;
void DispatchCompute() override;
void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override;

View file

@ -303,9 +303,9 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
return {staging.buffer, staging.offset};
}
ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(const Device& device_,
Scheduler& scheduler_,
DescriptorPool& descriptor_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr,
RESOLVE_CONDITIONAL_RENDER_COMP_SPV),

View file

@ -7,8 +7,8 @@
#include "video_core/fence_manager.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
namespace Core {
class System;

View file

@ -11,11 +11,9 @@
#include <utility>
#include <vector>
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_set.hpp>
#include "common/common_types.h"
#include "core/memory.h"
#include "video_core/engines/draw_manager.h"
#include "video_core/query_cache/query_cache.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
@ -30,6 +28,7 @@
namespace Vulkan {
using Tegra::Engines::Maxwell3D;
using VideoCommon::QueryType;
namespace {
@ -37,7 +36,7 @@ class SamplesQueryBank : public VideoCommon::BankBase {
public:
static constexpr size_t BANK_SIZE = 256;
static constexpr size_t QUERY_SIZE = 8;
SamplesQueryBank(const Device& device_, size_t index_)
explicit SamplesQueryBank(const Device& device_, size_t index_)
: BankBase(BANK_SIZE), device{device_}, index{index_} {
const auto& dev = device.GetLogical();
query_pool = dev.CreateQueryPool({
@ -109,18 +108,19 @@ struct HostSyncValues {
static constexpr bool GeneratesBaseBuffer = false;
};
template <typename Traits>
class SamplesStreamer : public BaseStreamer {
public:
SamplesStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_,
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_,
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_)
: BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_},
: BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_},
memory_allocator{memory_allocator_} {
BuildResolveBuffer();
current_bank = nullptr;
current_query = nullptr;
}
~SamplesStreamer() = default;
void StartCounter() override {
if (has_started) {
return;
@ -157,7 +157,7 @@ public:
PauseCounter();
}
bool HasPendingSync() override {
bool HasPendingSync() const override {
return !pending_sync.empty();
}
@ -198,7 +198,7 @@ public:
}
resolve_slots_remaining = resolve_slots;
sync_values_stash.emplace_back();
sync_values = sync_values = &sync_values_stash.back();
sync_values = &sync_values_stash.back();
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
}
resolve_slots_remaining--;
@ -207,6 +207,7 @@ public:
const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
(resolve_slots - resolve_slots_remaining - 1);
VkQueryPool query_pool = bank->GetInnerPool();
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([start, amount, base_offset, query_pool,
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
@ -284,7 +285,7 @@ public:
return index;
}
bool HasUnsyncedQueries() override {
bool HasUnsyncedQueries() const override {
return !pending_flush_queries.empty();
}
@ -348,8 +349,8 @@ private:
for (auto q : queries) {
auto* query = GetQuery(q);
ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) {
auto id = bank->GetIndex();
auto pair = indexer.try_emplace(id, std::numeric_limits<size_t>::max(),
auto id_ = bank->GetIndex();
auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(),
std::numeric_limits<size_t>::min());
auto& current_pair = pair.first->second;
current_pair.first = std::min(current_pair.first, start);
@ -434,13 +435,14 @@ private:
.pNext = nullptr,
.flags = 0,
.size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
resolve_buffers.emplace_back(
std::move(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)));
memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
}
static constexpr size_t resolve_slots = 8;
@ -476,7 +478,8 @@ class TFBQueryBank : public VideoCommon::BankBase {
public:
static constexpr size_t BANK_SIZE = 1024;
static constexpr size_t QUERY_SIZE = 4;
TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, size_t index_)
explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator,
size_t index_)
: BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} {
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
@ -525,22 +528,21 @@ private:
vk::Buffer buffer;
};
template <typename Traits>
class PrimitivesSucceededStreamer;
template <typename Traits>
class TFBCounterStreamer : public BaseStreamer {
public:
TFBCounterStreamer(size_t id, QueryCacheRuntime& runtime_, const Device& device_,
explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_,
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
StagingBufferPool& staging_pool_)
: BaseStreamer(id), runtime{runtime_}, device{device_}, scheduler{scheduler_},
: BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_},
memory_allocator{memory_allocator_}, staging_pool{staging_pool_} {
buffers_count = 0;
current_bank = nullptr;
counter_buffers.fill(VK_NULL_HANDLE);
offsets.fill(0);
last_queries.fill(0);
last_queries_stride.fill(1);
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
@ -564,6 +566,8 @@ public:
}
}
~TFBCounterStreamer() = default;
void StartCounter() override {
FlushBeginTFB();
has_started = true;
@ -581,15 +585,15 @@ public:
if (has_flushed_end_pending) {
FlushEndTFB();
}
runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) {
if (regs.transform_feedback_enabled == 0) {
runtime.View3DRegs([this](Maxwell3D& maxwell3d) {
if (maxwell3d.regs.transform_feedback_enabled == 0) {
streams_mask = 0;
has_started = false;
}
});
}
bool HasPendingSync() override {
bool HasPendingSync() const override {
return !pending_sync.empty();
}
@ -650,14 +654,19 @@ public:
return index;
}
std::optional<VAddr> GetLastQueryStream(size_t stream) {
std::optional<std::pair<VAddr, size_t>> GetLastQueryStream(size_t stream) {
if (last_queries[stream] != 0) {
return {last_queries[stream]};
std::pair<VAddr, size_t> result(last_queries[stream], last_queries_stride[stream]);
return result;
}
return std::nullopt;
}
bool HasUnsyncedQueries() override {
/// Returns the primitive topology last stored in `out_topology`, which
/// UpdateBuffers() copies from the draw manager's current draw state.
Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const {
return out_topology;
}
bool HasUnsyncedQueries() const override {
return !pending_flush_queries.empty();
}
@ -762,15 +771,17 @@ private:
void UpdateBuffers() {
last_queries.fill(0);
runtime.View3DRegs([this](Tegra::Engines::Maxwell3D::Regs& regs) {
last_queries_stride.fill(1);
runtime.View3DRegs([this](Maxwell3D& maxwell3d) {
buffers_count = 0;
for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers;
i++) {
const auto& tf = regs.transform_feedback;
out_topology = maxwell3d.draw_manager->GetDrawState().topology;
for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) {
const auto& tf = maxwell3d.regs.transform_feedback;
if (tf.buffers[i].enable == 0) {
continue;
}
const size_t stream = tf.controls[i].stream;
last_queries_stride[stream] = tf.controls[i].stride;
streams_mask |= 1ULL << stream;
buffers_count = std::max<size_t>(buffers_count, stream + 1);
}
@ -785,7 +796,8 @@ private:
});
current_bank = &bank_pool.GetBank(current_bank_id);
}
auto [dont_care, slot] = current_bank->Reserve();
auto [dont_care, other] = current_bank->Reserve();
const size_t slot = other; // workaround to compile bug.
current_bank->AddReference();
static constexpr VkMemoryBarrier READ_BARRIER{
@ -818,11 +830,9 @@ private:
return {current_bank_id, slot};
}
template <typename Traits>
friend class PrimitivesSucceededStreamer;
static constexpr size_t NUM_STREAMS = 4;
static constexpr size_t STREAMS_MASK = (1ULL << NUM_STREAMS) - 1ULL;
QueryCacheRuntime& runtime;
const Device& device;
@ -851,6 +861,8 @@ private:
std::array<VkBuffer, NUM_STREAMS> counter_buffers{};
std::array<VkDeviceSize, NUM_STREAMS> offsets{};
std::array<VAddr, NUM_STREAMS> last_queries;
std::array<size_t, NUM_STREAMS> last_queries_stride;
// Topology captured by UpdateBuffers(). Defaults to Points (the same default
// PrimitivesQueryBase::topology uses) so GetOutputTopology() never returns an
// indeterminate value when it is queried before the first capture.
Maxwell3D::Regs::PrimitiveTopology out_topology{Maxwell3D::Regs::PrimitiveTopology::Points};
u64 streams_mask;
};
@ -858,32 +870,34 @@ class PrimitivesQueryBase : public VideoCommon::QueryBase {
public:
// Default constructor
PrimitivesQueryBase()
: VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{},
dependant_index{}, dependant_manage{} {}
: VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {}
// Parameterized constructor
PrimitivesQueryBase(bool is_long, VAddr address)
: VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0), stride{},
dependant_index{}, dependant_manage{} {
if (is_long) {
PrimitivesQueryBase(bool has_timestamp, VAddr address)
: VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) {
if (has_timestamp) {
flags |= VideoCommon::QueryFlagBits::HasTimestamp;
}
}
u64 stride;
VAddr dependant_address;
size_t dependant_index;
bool dependant_manage;
u64 stride{};
VAddr dependant_address{};
Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points};
size_t dependant_index{};
bool dependant_manage{};
};
template <typename Traits>
class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer<PrimitivesQueryBase> {
public:
PrimitivesSucceededStreamer(size_t id, QueryCacheRuntime& runtime_,
TFBCounterStreamer<QueryCacheParams>& tfb_streamer_, Core::Memory::Memory& cpu_memory_)
: VideoCommon::SimpleStreamer<PrimitivesQueryBase>(
id, 1ULL << static_cast<u64>(VideoCommon::QueryType::StreamingByteCount)),
runtime{runtime_}, tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} {}
explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_,
TFBCounterStreamer& tfb_streamer_,
Core::Memory::Memory& cpu_memory_)
: VideoCommon::SimpleStreamer<PrimitivesQueryBase>(id_), runtime{runtime_},
tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} {
MakeDependent(&tfb_streamer);
}
~PrimitivesSucceededStreamer() = default;
size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
std::optional<u32> subreport_) override {
@ -901,8 +915,11 @@ public:
const size_t subreport = static_cast<size_t>(*subreport_);
auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport);
bool must_manage_dependance = false;
new_query->topology = tfb_streamer.GetOutputTopology();
if (dependant_address_opt) {
new_query->dependant_address = *dependant_address_opt;
auto [dep_address, stride] = *dependant_address_opt;
new_query->dependant_address = dep_address;
new_query->stride = stride;
} else {
new_query->dependant_index =
tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_);
@ -917,13 +934,13 @@ public:
}
return index;
}
new_query->stride = 1;
runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) {
for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) {
const auto& tf = maxwell3d.regs.transform_feedback;
if (tf.buffers[i].enable == 0) {
continue;
}
new_query->dependant_manage = must_manage_dependance;
runtime.View3DRegs([new_query, subreport](Tegra::Engines::Maxwell3D::Regs& regs) {
for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers;
i++) {
const auto& tf = regs.transform_feedback;
if (tf.controls[i].stream != subreport) {
continue;
}
@ -931,11 +948,14 @@ public:
break;
}
});
}
new_query->dependant_manage = must_manage_dependance;
pending_flush_queries.push_back(index);
return index;
}
bool HasUnsyncedQueries() override {
bool HasUnsyncedQueries() const override {
return !pending_flush_queries.empty();
}
@ -960,22 +980,49 @@ public:
}
query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced;
u64 num_vertices = 0;
if (query->dependant_manage) {
auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index);
query->value = dependant_query->value / query->stride;
num_vertices = dependant_query->value / query->stride;
tfb_streamer.Free(query->dependant_index);
} else {
u8* pointer = cpu_memory.GetPointer(query->dependant_address);
u32 result;
std::memcpy(&result, pointer, sizeof(u32));
query->value = static_cast<u64>(result) / query->stride;
num_vertices = static_cast<u64>(result) / query->stride;
}
// Convert the vertex count derived from the dependent byte counter into a
// primitive count for the topology recorded with the query.
query->value = [&]() -> u64 {
    // num_vertices is unsigned: subtracting from a count smaller than the
    // subtrahend would wrap around to a huge value, so clamp at zero instead.
    const auto saturating_sub = [](u64 lhs, u64 rhs) -> u64 {
        return lhs > rhs ? lhs - rhs : 0;
    };
    switch (query->topology) {
    case Maxwell3D::Regs::PrimitiveTopology::Points:
        return num_vertices;
    case Maxwell3D::Regs::PrimitiveTopology::Lines:
        return num_vertices / 2;
    case Maxwell3D::Regs::PrimitiveTopology::LineLoop:
        return (num_vertices / 2) + 1;
    case Maxwell3D::Regs::PrimitiveTopology::LineStrip:
        // A strip of N vertices has N - 1 segments; fewer than 2 vertices yield none.
        return saturating_sub(num_vertices, 1);
    case Maxwell3D::Regs::PrimitiveTopology::Patches:
    case Maxwell3D::Regs::PrimitiveTopology::Triangles:
    case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
        return num_vertices / 3;
    case Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
    case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
    case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
        // N vertices form N - 2 triangles; fewer than 3 vertices yield none.
        return saturating_sub(num_vertices, 2);
    case Maxwell3D::Regs::PrimitiveTopology::Quads:
        return num_vertices / 4;
    case Maxwell3D::Regs::PrimitiveTopology::Polygon:
        return 1U;
    default:
        // Unknown topology: report the raw vertex count.
        return num_vertices;
    }
}();
}
}
private:
QueryCacheRuntime& runtime;
TFBCounterStreamer<QueryCacheParams>& tfb_streamer;
TFBCounterStreamer& tfb_streamer;
Core::Memory::Memory& cpu_memory;
// syncing queue
@ -1005,7 +1052,10 @@ struct QueryCacheRuntimeImpl {
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
scheduler, memory_allocator, staging_pool),
primitives_succeeded_streamer(
static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, cpu_memory_),
static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer,
cpu_memory_),
primitives_needed_minus_suceeded_streamer(
static_cast<size_t>(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u),
hcr_setup{}, hcr_is_set{}, is_hcr_running{} {
hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT;
@ -1040,9 +1090,10 @@ struct QueryCacheRuntimeImpl {
// Streamers
VideoCommon::GuestStreamer<QueryCacheParams> guest_streamer;
SamplesStreamer<QueryCacheParams> sample_streamer;
TFBCounterStreamer<QueryCacheParams> tfb_streamer;
PrimitivesSucceededStreamer<QueryCacheParams> primitives_succeeded_streamer;
SamplesStreamer sample_streamer;
TFBCounterStreamer tfb_streamer;
PrimitivesSucceededStreamer primitives_succeeded_streamer;
VideoCommon::StubStreamer<QueryCacheParams> primitives_needed_minus_suceeded_streamer;
std::vector<std::pair<VAddr, VAddr>> little_cache;
std::vector<std::pair<VkBuffer, VkDeviceSize>> buffers_to_upload_to;
@ -1059,7 +1110,7 @@ struct QueryCacheRuntimeImpl {
bool is_hcr_running;
// maxwell3d
Tegra::Engines::Maxwell3D* maxwell3d;
Maxwell3D* maxwell3d;
};
QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
@ -1074,13 +1125,13 @@ QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
staging_pool_, compute_pass_descriptor_queue, descriptor_pool);
}
// Records the given 3D engine pointer in the implementation state
// (impl->maxwell3d). The pointer is stored as-is; nothing here checks it for
// null.
// NOTE(review): two signature lines appear below, one spelling the parameter
// type fully qualified and one using the short name. This looks like the
// before/after pair of a diff - confirm which one applies.
void QueryCacheRuntime::Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d) {
void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) {
impl->maxwell3d = maxwell3d;
}
// Invokes the callable `func` on the 3D engine held in impl->maxwell3d.
// impl->maxwell3d is dereferenced without a null check, so presumably
// Bind3DEngine must have been called first - TODO confirm.
// NOTE(review): two call lines appear below, one passing the engine's `regs`
// member and one passing the engine object itself. This looks like the
// before/after pair of a diff - confirm which one applies.
template <typename Func>
void QueryCacheRuntime::View3DRegs(Func&& func) {
func(impl->maxwell3d->regs);
func(*impl->maxwell3d);
}
void QueryCacheRuntime::EndHostConditionalRendering() {
@ -1240,8 +1291,12 @@ VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryTyp
return &impl->sample_streamer;
case QueryType::StreamingByteCount:
return &impl->tfb_streamer;
case QueryType::StreamingPrimitivesNeeded:
case QueryType::VtgPrimitivesOut:
case QueryType::StreamingPrimitivesSucceeded:
return &impl->primitives_succeeded_streamer;
case QueryType::StreamingPrimitivesNeededMinusSucceeded:
return &impl->primitives_needed_minus_suceeded_streamer;
default:
return nullptr;
}

View file

@ -49,7 +49,8 @@ public:
bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty);
bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
VideoCommon::LookupData object_2, bool qc_dirty, bool equal_check);
VideoCommon::LookupData object_2, bool qc_dirty,
bool equal_check);
VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type);
@ -66,7 +67,7 @@ private:
};
// Parameter bundle passed as the template argument of
// VideoCommon::QueryCacheBase; it names the runtime type as
// Vulkan::QueryCacheRuntime.
// NOTE(review): RuntimeType is declared on two lines below, with and without
// `typename`. This looks like the before/after pair of a diff - confirm which
// one applies.
struct QueryCacheParams {
using RuntimeType = Vulkan::QueryCacheRuntime;
using RuntimeType = typename Vulkan::QueryCacheRuntime;
};
// Query cache type for this backend: the common cache base instantiated with
// the parameter bundle above.
using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>;

View file

@ -194,15 +194,6 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
query_cache.NotifySegment(true);
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
// query_cache.UpdateCounters();
}
#else
// query_cache.UpdateCounters();
#endif
GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()};
if (!pipeline) {
return;
@ -294,15 +285,6 @@ void RasterizerVulkan::DrawTexture() {
query_cache.NotifySegment(true);
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
// query_cache.UpdateCounters();
}
#else
// query_cache.UpdateCounters();
#endif
texture_cache.SynchronizeGraphicsDescriptors();
texture_cache.UpdateRenderTargets(false);
@ -332,15 +314,6 @@ void RasterizerVulkan::Clear(u32 layer_count) {
FlushWork();
gpu_memory->FlushCaching();
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
// query_cache.UpdateCounters();
}
#else
// query_cache.UpdateCounters();
#endif
query_cache.NotifySegment(true);
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);

View file

@ -85,7 +85,8 @@ public:
void Clear(u32 layer_count) override;
void DispatchCompute() override;
void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override;

View file

@ -15,9 +15,13 @@
#include "common/common_types.h"
#include "common/polyfill_thread.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
// Forward declaration of the common query cache template. The uses of
// QueryCacheBase visible in this header are a reference parameter and a
// pointer member, neither of which needs the full definition.
namespace VideoCommon {
template <typename Trait>
class QueryCacheBase;
}
namespace Vulkan {
class CommandPool;
@ -26,6 +30,8 @@ class Framebuffer;
class GraphicsPipeline;
class StateTracker;
struct QueryCacheParams;
/// The scheduler abstracts command buffer and fence management with an interface that's able to do
/// OpenGL-like operations on Vulkan command buffers.
class Scheduler {
@ -63,7 +69,7 @@ public:
void InvalidateState();
/// Assigns the query cache.
/// Stores the address of `query_cache_` in the `query_cache` member; only a
/// pointer is kept, so presumably the caller must keep the cache alive while
/// the scheduler uses it - TODO confirm.
/// NOTE(review): two signature lines appear below, one taking `QueryCache&`
/// and one taking `VideoCommon::QueryCacheBase<QueryCacheParams>&`. This looks
/// like the before/after pair of a diff - confirm which one applies.
void SetQueryCache(QueryCache& query_cache_) {
void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) {
query_cache = &query_cache_;
}
@ -219,7 +225,7 @@ private:
std::unique_ptr<MasterSemaphore> master_semaphore;
std::unique_ptr<CommandPool> command_pool;
QueryCache* query_cache = nullptr;
VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
vk::CommandBuffer current_cmdbuf;