1
0
Fork 0
forked from suyu/suyu

Merge pull request #1730 from hrydgard/vertex-loader

* Remove late accesses to attribute_config

* Refactor: Extract VertexLoader from command_processor.cpp.

Preparation for a similar concept to Dolphin or PPSSPP. These can be JIT-ed and cached.

* Move "&" to their proper place, add missing includes and make some properly relative.

* Don't keep base_address in the loader, it doesn't belong there (with it, the loader can't be cached).

* Optimize the vertex loader, nearly doubling its speed.

* Debugger fix

* Move and rename the MemoryAccesses class to MemoryAccessTracker.
This commit is contained in:
bunnei 2016-04-29 09:42:47 -04:00
commit 90243c56fb
6 changed files with 210 additions and 121 deletions

View file

@ -16,6 +16,7 @@ set(SRCS
shader/shader_interpreter.cpp shader/shader_interpreter.cpp
swrasterizer.cpp swrasterizer.cpp
utils.cpp utils.cpp
vertex_loader.cpp
video_core.cpp video_core.cpp
) )
@ -43,6 +44,7 @@ set(HEADERS
shader/shader_interpreter.h shader/shader_interpreter.h
swrasterizer.h swrasterizer.h
utils.h utils.h
vertex_loader.h
video_core.h video_core.h
) )

View file

@ -21,6 +21,7 @@
#include "video_core/video_core.h" #include "video_core/video_core.h"
#include "video_core/debug_utils/debug_utils.h" #include "video_core/debug_utils/debug_utils.h"
#include "video_core/shader/shader_interpreter.h" #include "video_core/shader/shader_interpreter.h"
#include "video_core/vertex_loader.h"
namespace Pica { namespace Pica {
@ -188,54 +189,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
#if PICA_LOG_TEV #if PICA_LOG_TEV
DebugUtils::DumpTevStageConfig(regs.GetTevStages()); DebugUtils::DumpTevStageConfig(regs.GetTevStages());
#endif #endif
if (g_debug_context) if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
const auto& attribute_config = regs.vertex_attributes; // Processes information about internal vertex attributes to figure out how a vertex is loaded.
const u32 base_address = attribute_config.GetPhysicalBaseAddress(); // Later, these can be compiled and cached.
VertexLoader loader;
// Information about internal vertex attributes const u32 base_address = regs.vertex_attributes.GetPhysicalBaseAddress();
u32 vertex_attribute_sources[16]; loader.Setup(regs);
boost::fill(vertex_attribute_sources, 0xdeadbeef);
u32 vertex_attribute_strides[16] = {};
Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
u32 vertex_attribute_elements[16] = {};
u32 vertex_attribute_element_size[16] = {};
// Setup attribute data from loaders
for (int loader = 0; loader < 12; ++loader) {
const auto& loader_config = attribute_config.attribute_loaders[loader];
u32 offset = 0;
// TODO: What happens if a loader overwrites a previous one's data?
for (unsigned component = 0; component < loader_config.component_count; ++component) {
if (component >= 12) {
LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component);
continue;
}
u32 attribute_index = loader_config.GetComponent(component);
if (attribute_index < 12) {
int element_size = attribute_config.GetElementSizeInBytes(attribute_index);
offset = Common::AlignUp(offset, element_size);
vertex_attribute_sources[attribute_index] = base_address + loader_config.data_offset + offset;
vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
vertex_attribute_element_size[attribute_index] = element_size;
offset += attribute_config.GetStride(attribute_index);
} else if (attribute_index < 16) {
// Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
offset = Common::AlignUp(offset, 4);
offset += (attribute_index - 11) * 4;
} else {
UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
}
}
}
// Load vertices // Load vertices
bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed)); bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
@ -259,32 +220,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
} }
} }
class { DebugUtils::MemoryAccessTracker memory_accesses;
/// Combine overlapping and close ranges
void SimplifyRanges() {
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
// NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too
auto it2 = std::next(it);
while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) {
it->second = std::max(it->second, it2->first + it2->second - it->first);
it2 = ranges.erase(it2);
}
}
}
public:
/// Record a particular memory access in the list
void AddAccess(u32 paddr, u32 size) {
// Create new range or extend existing one
ranges[paddr] = std::max(ranges[paddr], size);
// Simplify ranges...
SimplifyRanges();
}
/// Map of accessed ranges (mapping start address to range size)
std::map<u32, u32> ranges;
} memory_accesses;
// Simple circular-replacement vertex cache // Simple circular-replacement vertex cache
// The size has been tuned for optimal balance between hit-rate and the cost of lookup // The size has been tuned for optimal balance between hit-rate and the cost of lookup
@ -328,60 +264,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
if (!vertex_cache_hit) { if (!vertex_cache_hit) {
// Initialize data for the current vertex // Initialize data for the current vertex
Shader::InputVertex input; Shader::InputVertex input;
loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
if (vertex_attribute_elements[i] != 0) {
// Default attribute values set if array elements have < 4 components. This
// is *not* carried over from the default attribute settings even if they're
// enabled for this attribute.
static const float24 zero = float24::FromFloat32(0.0f);
static const float24 one = float24::FromFloat32(1.0f);
input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
// Load per-vertex data from the loader arrays
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
if (g_debug_context && Pica::g_debug_context->recorder) {
memory_accesses.AddAccess(source_addr,
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
: (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
}
const float srcval =
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *reinterpret_cast<const s8*>(srcdata) :
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *reinterpret_cast<const u8*>(srcdata) :
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *reinterpret_cast<const s16*>(srcdata) :
*reinterpret_cast<const float*>(srcdata);
input.attr[i][comp] = float24::FromFloat32(srcval);
LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
comp, i, vertex, index,
attribute_config.GetPhysicalBaseAddress(),
vertex_attribute_sources[i] - base_address,
vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
input.attr[i][comp].ToFloat32());
}
} else if (attribute_config.IsDefaultAttribute(i)) {
// Load the default attribute if we're configured to do so
input.attr[i] = g_state.vs.default_attributes[i];
LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
i, vertex, index,
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
} else {
// TODO(yuriks): In this case, no data gets loaded and the vertex
// remains with the last value it had. This isn't currently maintained
// as global state, however, and so won't work in Citra yet.
}
}
if (g_debug_context) if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
// Send to vertex shader // Send to vertex shader
output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
if (is_indexed) { if (is_indexed) {
vertex_cache[vertex_cache_pos] = output; vertex_cache[vertex_cache_pos] = output;

View file

@ -216,6 +216,36 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages); void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
/**
* Used in the vertex loader to merge access records. TODO: Investigate if actually useful.
*/
class MemoryAccessTracker {
/// Combine overlapping and close ranges
void SimplifyRanges() {
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
// NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too
auto it2 = std::next(it);
while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) {
it->second = std::max(it->second, it2->first + it2->second - it->first);
it2 = ranges.erase(it2);
}
}
}
public:
/// Record a particular memory access in the list
void AddAccess(u32 paddr, u32 size) {
// Create new range or extend existing one
ranges[paddr] = std::max(ranges[paddr], size);
// Simplify ranges...
SimplifyRanges();
}
/// Map of accessed ranges (mapping start address to range size)
std::map<u32, u32> ranges;
};
} // namespace } // namespace
} // namespace } // namespace

View file

@ -25,7 +25,7 @@ namespace Pica {
namespace Shader { namespace Shader {
struct InputVertex { struct InputVertex {
Math::Vec4<float24> attr[16]; alignas(16) Math::Vec4<float24> attr[16];
}; };
struct OutputVertex { struct OutputVertex {

View file

@ -0,0 +1,140 @@
#include <cmath>
#include <string>
#include "boost/range/algorithm/fill.hpp"
#include "common/assert.h"
#include "common/alignment.h"
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "core/memory.h"
#include "video_core/debug_utils/debug_utils.h"
#include "video_core/pica.h"
#include "video_core/pica_state.h"
#include "video_core/pica_types.h"
#include "video_core/vertex_loader.h"
namespace Pica {
/**
 * Derives the per-attribute load parameters (source offset, stride, format and element
 * count) from the vertex attribute registers and stores them in this loader.
 *
 * Source offsets are stored relative to the attribute base address; the base address is
 * passed separately to LoadVertex. State from an earlier Setup() call is discarded, so the
 * same loader object can be set up again.
 *
 * @param regs Register state whose vertex_attributes configuration is read.
 */
void VertexLoader::Setup(const Pica::Regs& regs) {
    const auto& attribute_config = regs.vertex_attributes;
    num_total_attributes = attribute_config.GetNumTotalAttributes();

    // Placeholder offset for attributes that no loader provides
    boost::fill(vertex_attribute_sources, 0xdeadbeef);

    for (int i = 0; i < 16; i++) {
        // Clear what a previous Setup() call may have left behind: LoadVertex treats a
        // non-zero element count as "read this attribute from memory", so a stale count would
        // make it read through the placeholder offset for attributes that are no longer
        // referenced by any loader.
        vertex_attribute_elements[i] = 0;
        vertex_attribute_strides[i] = 0;

        vertex_attribute_is_default[i] = attribute_config.IsDefaultAttribute(i);
    }

    // Setup attribute data from loaders
    // (the loader index is unsigned so that it matches the %u specifier in the log call below)
    for (unsigned loader = 0; loader < 12; ++loader) {
        const auto& loader_config = attribute_config.attribute_loaders[loader];

        // Running byte offset of the current component inside this loader's data
        u32 offset = 0;

        // TODO: What happens if a loader overwrites a previous one's data?
        for (unsigned component = 0; component < loader_config.component_count; ++component) {
            if (component >= 12) {
                LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component);
                continue;
            }

            u32 attribute_index = loader_config.GetComponent(component);
            if (attribute_index < 12) {
                // Each attribute starts at an offset aligned to its element size
                offset = Common::AlignUp(offset, attribute_config.GetElementSizeInBytes(attribute_index));
                vertex_attribute_sources[attribute_index] = loader_config.data_offset + offset;
                vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
                vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
                vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
                offset += attribute_config.GetStride(attribute_index);
            } else if (attribute_index < 16) {
                // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
                offset = Common::AlignUp(offset, 4);
                offset += (attribute_index - 11) * 4;
            } else {
                UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
            }
        }
    }
}
void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses) {
for (int i = 0; i < num_total_attributes; ++i) {
if (vertex_attribute_elements[i] != 0) {
// Load per-vertex data from the loader arrays
u32 source_addr = base_address + vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex;
if (g_debug_context && Pica::g_debug_context->recorder) {
memory_accesses.AddAccess(source_addr, vertex_attribute_elements[i] * (
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
: (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1));
}
switch (vertex_attribute_formats[i]) {
case Regs::VertexAttributeFormat::BYTE:
{
const s8* srcdata = reinterpret_cast<const s8*>(Memory::GetPhysicalPointer(source_addr));
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
}
break;
}
case Regs::VertexAttributeFormat::UBYTE:
{
const u8* srcdata = reinterpret_cast<const u8*>(Memory::GetPhysicalPointer(source_addr));
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
}
break;
}
case Regs::VertexAttributeFormat::SHORT:
{
const s16* srcdata = reinterpret_cast<const s16*>(Memory::GetPhysicalPointer(source_addr));
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
}
break;
}
case Regs::VertexAttributeFormat::FLOAT:
{
const float* srcdata = reinterpret_cast<const float*>(Memory::GetPhysicalPointer(source_addr));
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
}
break;
}
}
// Default attribute values set if array elements have < 4 components. This
// is *not* carried over from the default attribute settings even if they're
// enabled for this attribute.
for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) {
input.attr[i][comp] = comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
}
LOG_TRACE(HW_GPU, "Loaded %d components of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f %f %f %f",
vertex_attribute_elements[i], i, vertex, index,
base_address,
vertex_attribute_sources[i],
vertex_attribute_strides[i] * vertex,
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
} else if (vertex_attribute_is_default[i]) {
// Load the default attribute if we're configured to do so
input.attr[i] = g_state.vs.default_attributes[i];
LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
i, vertex, index,
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
} else {
// TODO(yuriks): In this case, no data gets loaded and the vertex
// remains with the last value it had. This isn't currently maintained
// as global state, however, and so won't work in Citra yet.
}
}
}
} // namespace Pica

View file

@ -0,0 +1,28 @@
#pragma once
#include <iterator>
#include <algorithm>
#include "video_core/pica.h"
#include "video_core/shader/shader.h"
#include "video_core/debug_utils/debug_utils.h"
namespace Pica {
/**
 * Holds per-attribute load parameters derived from the vertex attribute registers and uses
 * them to fill a Shader::InputVertex.
 *
 * Call Setup() before LoadVertex(). All members are zero-initialized, so a loader that has
 * not been set up reports zero attributes instead of reading indeterminate values.
 */
class VertexLoader {
public:
    /// Reads regs.vertex_attributes and stores the resulting per-attribute load parameters.
    void Setup(const Pica::Regs& regs);

    /**
     * Fills `input` with the attributes of one vertex.
     * @param base_address Address added to each attribute's stored source offset.
     * @param index Only used in the trace log output.
     * @param vertex Vertex number; multiplied by the attribute stride to locate the data.
     * @param input Receives the loaded attributes.
     * @param memory_accesses Receives the accessed ranges while a debug recorder is present.
     */
    void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses);

    /// Total attribute count captured by the last Setup() call (0 if Setup() was never called).
    int GetNumTotalAttributes() const { return num_total_attributes; }

private:
    /// Offset of each attribute's data, relative to the base address passed to LoadVertex
    u32 vertex_attribute_sources[16] = {};
    /// Distance in bytes between the data of consecutive vertices, per attribute
    u32 vertex_attribute_strides[16] = {};
    /// Data format of each attribute's elements
    Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
    /// Number of elements read per attribute; 0 means the attribute is not read from memory
    u32 vertex_attribute_elements[16] = {};
    /// Whether the attribute takes its default value when it is not read from memory
    bool vertex_attribute_is_default[16] = {};
    /// Number of attributes LoadVertex iterates over
    int num_total_attributes = 0;
};
} // namespace Pica