Merge pull request #562 from neobrain/pica_progress3
More PICA200 Emulation Fixes
This commit is contained in:
commit
4a48b017ca
9 changed files with 339 additions and 210 deletions
|
@ -372,15 +372,15 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
|
||||||
Memory::VirtualToPhysicalAddress(params.start1) >> 3);
|
Memory::VirtualToPhysicalAddress(params.start1) >> 3);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
|
||||||
Memory::VirtualToPhysicalAddress(params.end1) >> 3);
|
Memory::VirtualToPhysicalAddress(params.end1) >> 3);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - params.start1);
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1);
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1);
|
||||||
|
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
|
||||||
Memory::VirtualToPhysicalAddress(params.start2) >> 3);
|
Memory::VirtualToPhysicalAddress(params.start2) >> 3);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
|
||||||
Memory::VirtualToPhysicalAddress(params.end2) >> 3);
|
Memory::VirtualToPhysicalAddress(params.end2) >> 3);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2);
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2);
|
||||||
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2);
|
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -109,9 +109,13 @@ struct Command {
|
||||||
u32 start1;
|
u32 start1;
|
||||||
u32 value1;
|
u32 value1;
|
||||||
u32 end1;
|
u32 end1;
|
||||||
|
|
||||||
u32 start2;
|
u32 start2;
|
||||||
u32 value2;
|
u32 value2;
|
||||||
u32 end2;
|
u32 end2;
|
||||||
|
|
||||||
|
u16 control1;
|
||||||
|
u16 control2;
|
||||||
} memory_fill;
|
} memory_fill;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
|
|
|
@ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) {
|
||||||
switch (index) {
|
switch (index) {
|
||||||
|
|
||||||
// Memory fills are triggered once the fill value is written.
|
// Memory fills are triggered once the fill value is written.
|
||||||
// NOTE: This is not verified.
|
case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3):
|
||||||
case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3):
|
case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3):
|
||||||
case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3):
|
|
||||||
{
|
{
|
||||||
const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value));
|
const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
|
||||||
const auto& config = g_regs.memory_fill_config[is_second_filler];
|
auto& config = g_regs.memory_fill_config[is_second_filler];
|
||||||
|
|
||||||
// TODO: Not sure if this check should be done at GSP level instead
|
if (config.address_start && config.trigger) {
|
||||||
if (config.address_start) {
|
u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
|
||||||
// TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
|
u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
|
||||||
u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
|
|
||||||
u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
|
if (config.fill_24bit) {
|
||||||
for (u32* ptr = start; ptr < end; ++ptr)
|
// fill with 24-bit values
|
||||||
*ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
|
for (u8* ptr = start; ptr < end; ptr += 3) {
|
||||||
|
ptr[0] = config.value_24bit_b;
|
||||||
|
ptr[1] = config.value_24bit_g;
|
||||||
|
ptr[2] = config.value_24bit_r;
|
||||||
|
}
|
||||||
|
} else if (config.fill_32bit) {
|
||||||
|
// fill with 32-bit values
|
||||||
|
for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
|
||||||
|
*ptr = config.value_32bit;
|
||||||
|
} else {
|
||||||
|
// fill with 16-bit values
|
||||||
|
for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
|
||||||
|
*ptr = config.value_16bit;
|
||||||
|
}
|
||||||
|
|
||||||
LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
|
LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
|
||||||
|
|
||||||
|
config.trigger = 0;
|
||||||
|
config.finished = 1;
|
||||||
|
|
||||||
if (!is_second_filler) {
|
if (!is_second_filler) {
|
||||||
GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
|
GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -84,9 +84,35 @@ struct Regs {
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
u32 address_start;
|
u32 address_start;
|
||||||
u32 address_end; // ?
|
u32 address_end;
|
||||||
u32 size;
|
|
||||||
u32 value; // ?
|
union {
|
||||||
|
u32 value_32bit;
|
||||||
|
|
||||||
|
BitField<0, 16, u32> value_16bit;
|
||||||
|
|
||||||
|
// TODO: Verify component order
|
||||||
|
BitField< 0, 8, u32> value_24bit_r;
|
||||||
|
BitField< 8, 8, u32> value_24bit_g;
|
||||||
|
BitField<16, 8, u32> value_24bit_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
union {
|
||||||
|
u32 control;
|
||||||
|
|
||||||
|
// Setting this field to 1 triggers the memory fill.
|
||||||
|
// This field also acts as a status flag, and gets reset to 0 upon completion.
|
||||||
|
BitField<0, 1, u32> trigger;
|
||||||
|
|
||||||
|
// Set to 1 upon completion.
|
||||||
|
BitField<0, 1, u32> finished;
|
||||||
|
|
||||||
|
// 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values
|
||||||
|
BitField<8, 1, u32> fill_24bit;
|
||||||
|
|
||||||
|
// 0: fill with 16-bit wide values; 1: fill with 32-bit wide values
|
||||||
|
BitField<9, 1, u32> fill_32bit;
|
||||||
|
};
|
||||||
|
|
||||||
inline u32 GetStartAddress() const {
|
inline u32 GetStartAddress() const {
|
||||||
return DecodeAddressRegister(address_start);
|
return DecodeAddressRegister(address_start);
|
||||||
|
|
|
@ -15,30 +15,18 @@ namespace Clipper {
|
||||||
|
|
||||||
struct ClippingEdge {
|
struct ClippingEdge {
|
||||||
public:
|
public:
|
||||||
enum Type {
|
ClippingEdge(Math::Vec4<float24> coeffs,
|
||||||
POS_X = 0,
|
Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
|
||||||
NEG_X = 1,
|
float24::FromFloat32(0),
|
||||||
POS_Y = 2,
|
float24::FromFloat32(0),
|
||||||
NEG_Y = 3,
|
float24::FromFloat32(0)))
|
||||||
POS_Z = 4,
|
: coeffs(coeffs),
|
||||||
NEG_Z = 5,
|
bias(bias)
|
||||||
};
|
{
|
||||||
|
}
|
||||||
ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
|
|
||||||
|
|
||||||
bool IsInside(const OutputVertex& vertex) const {
|
bool IsInside(const OutputVertex& vertex) const {
|
||||||
switch (type) {
|
return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
|
||||||
case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
|
|
||||||
case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
|
|
||||||
case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
|
|
||||||
case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
|
|
||||||
|
|
||||||
// TODO: Check z compares ... should be 0..1 instead?
|
|
||||||
case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
|
|
||||||
|
|
||||||
default:
|
|
||||||
case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IsOutSide(const OutputVertex& vertex) const {
|
bool IsOutSide(const OutputVertex& vertex) const {
|
||||||
|
@ -46,31 +34,17 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
|
OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
|
||||||
auto dotpr = [this](const OutputVertex& vtx) {
|
float24 dp = Math::Dot(v0.pos + bias, coeffs);
|
||||||
switch (type) {
|
float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
|
||||||
case POS_X: return vtx.pos.x - vtx.pos.w;
|
|
||||||
case NEG_X: return -vtx.pos.x - vtx.pos.w;
|
|
||||||
case POS_Y: return vtx.pos.y - vtx.pos.w;
|
|
||||||
case NEG_Y: return -vtx.pos.y - vtx.pos.w;
|
|
||||||
|
|
||||||
// TODO: Verify z clipping
|
|
||||||
case POS_Z: return vtx.pos.z - vtx.pos.w;
|
|
||||||
|
|
||||||
default:
|
|
||||||
case NEG_Z: return -vtx.pos.w;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
float24 dp = dotpr(v0);
|
|
||||||
float24 dp_prev = dotpr(v1);
|
|
||||||
float24 factor = dp_prev / (dp_prev - dp);
|
float24 factor = dp_prev / (dp_prev - dp);
|
||||||
|
|
||||||
return OutputVertex::Lerp(factor, v0, v1);
|
return OutputVertex::Lerp(factor, v0, v1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Type type;
|
|
||||||
float24 pos;
|
float24 pos;
|
||||||
|
Math::Vec4<float24> coeffs;
|
||||||
|
Math::Vec4<float24> bias;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void InitScreenCoordinates(OutputVertex& vtx)
|
static void InitScreenCoordinates(OutputVertex& vtx)
|
||||||
|
@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx)
|
||||||
vtx.tc2 *= inv_w;
|
vtx.tc2 *= inv_w;
|
||||||
vtx.pos.w = inv_w;
|
vtx.pos.w = inv_w;
|
||||||
|
|
||||||
// TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
|
|
||||||
vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
|
vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
|
||||||
vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
|
vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
|
||||||
vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
|
vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
|
void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
|
||||||
|
@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
|
||||||
auto* output_list = &buffer_a;
|
auto* output_list = &buffer_a;
|
||||||
auto* input_list = &buffer_b;
|
auto* input_list = &buffer_b;
|
||||||
|
|
||||||
|
// NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
|
||||||
|
// TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
|
||||||
|
// epsilon possible within float24 accuracy.
|
||||||
|
static const float24 EPSILON = float24::FromFloat32(0.00001);
|
||||||
|
static const float24 f0 = float24::FromFloat32(0.0);
|
||||||
|
static const float24 f1 = float24::FromFloat32(1.0);
|
||||||
|
static const std::array<ClippingEdge, 7> clipping_edges = {{
|
||||||
|
{ Math::MakeVec( f1, f0, f0, -f1) }, // x = +w
|
||||||
|
{ Math::MakeVec(-f1, f0, f0, -f1) }, // x = -w
|
||||||
|
{ Math::MakeVec( f0, f1, f0, -f1) }, // y = +w
|
||||||
|
{ Math::MakeVec( f0, -f1, f0, -f1) }, // y = -w
|
||||||
|
{ Math::MakeVec( f0, f0, f1, f0) }, // z = 0
|
||||||
|
{ Math::MakeVec( f0, f0, -f1, -f1) }, // z = -w
|
||||||
|
{ Math::MakeVec( f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
|
||||||
|
}};
|
||||||
|
|
||||||
|
// TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
|
||||||
|
// drop the whole primitive instead of clipping the primitive properly. We should test if
|
||||||
|
// this happens on the 3DS, too.
|
||||||
|
|
||||||
// Simple implementation of the Sutherland-Hodgman clipping algorithm.
|
// Simple implementation of the Sutherland-Hodgman clipping algorithm.
|
||||||
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
|
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
|
||||||
for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
|
for (auto edge : clipping_edges) {
|
||||||
ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
|
|
||||||
ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
|
|
||||||
ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
|
|
||||||
ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
|
|
||||||
ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
|
|
||||||
|
|
||||||
std::swap(input_list, output_list);
|
std::swap(input_list, output_list);
|
||||||
output_list->clear();
|
output_list->clear();
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
// Licensed under GPLv2 or any later version
|
// Licensed under GPLv2 or any later version
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#include <boost/range/algorithm/fill.hpp>
|
||||||
|
|
||||||
#include "clipper.h"
|
#include "clipper.h"
|
||||||
#include "command_processor.h"
|
#include "command_processor.h"
|
||||||
#include "math.h"
|
#include "math.h"
|
||||||
|
@ -23,10 +25,6 @@ static int float_regs_counter = 0;
|
||||||
|
|
||||||
static u32 uniform_write_buffer[4];
|
static u32 uniform_write_buffer[4];
|
||||||
|
|
||||||
// Used for VSLoadProgramData and VSLoadSwizzleData
|
|
||||||
static u32 vs_binary_write_offset = 0;
|
|
||||||
static u32 vs_swizzle_write_offset = 0;
|
|
||||||
|
|
||||||
static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
|
|
||||||
if (id >= registers.NumIds())
|
if (id >= registers.NumIds())
|
||||||
|
@ -65,10 +63,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
|
|
||||||
// Information about internal vertex attributes
|
// Information about internal vertex attributes
|
||||||
u32 vertex_attribute_sources[16];
|
u32 vertex_attribute_sources[16];
|
||||||
std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
|
boost::fill(vertex_attribute_sources, 0xdeadbeef);
|
||||||
u32 vertex_attribute_strides[16];
|
u32 vertex_attribute_strides[16];
|
||||||
u32 vertex_attribute_formats[16];
|
u32 vertex_attribute_formats[16];
|
||||||
u32 vertex_attribute_elements[16];
|
|
||||||
|
// HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
|
||||||
|
// This is one of the hacks required to deal with uninitialized vertex attributes.
|
||||||
|
// TODO: Fix this properly.
|
||||||
|
u32 vertex_attribute_elements[16] = {};
|
||||||
u32 vertex_attribute_element_size[16];
|
u32 vertex_attribute_element_size[16];
|
||||||
|
|
||||||
// Setup attribute data from loaders
|
// Setup attribute data from loaders
|
||||||
|
@ -252,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Seems to be used to reset the write pointer for VSLoadProgramData
|
|
||||||
case PICA_REG_INDEX(vs_program.begin_load):
|
|
||||||
vs_binary_write_offset = 0;
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Load shader program code
|
// Load shader program code
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
|
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
|
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
|
||||||
|
@ -267,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
|
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
|
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
|
||||||
{
|
{
|
||||||
VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
|
VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value);
|
||||||
vs_binary_write_offset++;
|
registers.vs_program.offset++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Seems to be used to reset the write pointer for VSLoadSwizzleData
|
|
||||||
case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
|
|
||||||
vs_swizzle_write_offset = 0;
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Load swizzle pattern data
|
// Load swizzle pattern data
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
|
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
|
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
|
||||||
|
@ -287,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
|
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
|
||||||
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
|
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
|
||||||
{
|
{
|
||||||
VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
|
VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
|
||||||
vs_swizzle_write_offset++;
|
registers.vs_swizzle_patterns.offset++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -120,6 +120,7 @@ struct Regs {
|
||||||
enum WrapMode : u32 {
|
enum WrapMode : u32 {
|
||||||
ClampToEdge = 0,
|
ClampToEdge = 0,
|
||||||
Repeat = 2,
|
Repeat = 2,
|
||||||
|
MirroredRepeat = 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
INSERT_PADDING_WORDS(0x1);
|
INSERT_PADDING_WORDS(0x1);
|
||||||
|
@ -131,7 +132,7 @@ struct Regs {
|
||||||
|
|
||||||
union {
|
union {
|
||||||
BitField< 8, 2, WrapMode> wrap_s;
|
BitField< 8, 2, WrapMode> wrap_s;
|
||||||
BitField<11, 2, WrapMode> wrap_t;
|
BitField<12, 2, WrapMode> wrap_t;
|
||||||
};
|
};
|
||||||
|
|
||||||
INSERT_PADDING_WORDS(0x1);
|
INSERT_PADDING_WORDS(0x1);
|
||||||
|
@ -223,6 +224,8 @@ struct Regs {
|
||||||
struct TevStageConfig {
|
struct TevStageConfig {
|
||||||
enum class Source : u32 {
|
enum class Source : u32 {
|
||||||
PrimaryColor = 0x0,
|
PrimaryColor = 0x0,
|
||||||
|
PrimaryFragmentColor = 0x1,
|
||||||
|
|
||||||
Texture0 = 0x3,
|
Texture0 = 0x3,
|
||||||
Texture1 = 0x4,
|
Texture1 = 0x4,
|
||||||
Texture2 = 0x5,
|
Texture2 = 0x5,
|
||||||
|
@ -265,6 +268,9 @@ struct Regs {
|
||||||
AddSigned = 3,
|
AddSigned = 3,
|
||||||
Lerp = 4,
|
Lerp = 4,
|
||||||
Subtract = 5,
|
Subtract = 5,
|
||||||
|
|
||||||
|
MultiplyThenAdd = 8,
|
||||||
|
AddThenMultiply = 9,
|
||||||
};
|
};
|
||||||
|
|
||||||
union {
|
union {
|
||||||
|
@ -337,7 +343,7 @@ struct Regs {
|
||||||
};
|
};
|
||||||
|
|
||||||
union {
|
union {
|
||||||
enum BlendEquation : u32 {
|
enum class BlendEquation : u32 {
|
||||||
Add = 0,
|
Add = 0,
|
||||||
Subtract = 1,
|
Subtract = 1,
|
||||||
ReverseSubtract = 2,
|
ReverseSubtract = 2,
|
||||||
|
@ -421,7 +427,7 @@ struct Regs {
|
||||||
INSERT_PADDING_WORDS(0x6);
|
INSERT_PADDING_WORDS(0x6);
|
||||||
|
|
||||||
u32 depth_format;
|
u32 depth_format;
|
||||||
u32 color_format;
|
BitField<16, 3, u32> color_format;
|
||||||
|
|
||||||
INSERT_PADDING_WORDS(0x4);
|
INSERT_PADDING_WORDS(0x4);
|
||||||
|
|
||||||
|
@ -678,7 +684,9 @@ struct Regs {
|
||||||
INSERT_PADDING_WORDS(0x2);
|
INSERT_PADDING_WORDS(0x2);
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
u32 begin_load;
|
// Offset of the next instruction to write code to.
|
||||||
|
// Incremented with each instruction write.
|
||||||
|
u32 offset;
|
||||||
|
|
||||||
// Writing to these registers sets the "current" word in the shader program.
|
// Writing to these registers sets the "current" word in the shader program.
|
||||||
// TODO: It's not clear how the hardware stores what the "current" word is.
|
// TODO: It's not clear how the hardware stores what the "current" word is.
|
||||||
|
@ -690,7 +698,9 @@ struct Regs {
|
||||||
// This register group is used to load an internal table of swizzling patterns,
|
// This register group is used to load an internal table of swizzling patterns,
|
||||||
// which are indexed by each shader instruction to specify vector component swizzling.
|
// which are indexed by each shader instruction to specify vector component swizzling.
|
||||||
struct {
|
struct {
|
||||||
u32 begin_load;
|
// Offset of the next swizzle pattern to write.
|
||||||
|
// Incremented with each swizzle pattern write.
|
||||||
|
u32 offset;
|
||||||
|
|
||||||
// Writing to these registers sets the "current" swizzle pattern in the table.
|
// Writing to these registers sets the "current" swizzle pattern in the table.
|
||||||
// TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
|
// TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
#include "common/math_util.h"
|
||||||
|
|
||||||
#include "math.h"
|
#include "math.h"
|
||||||
#include "pica.h"
|
#include "pica.h"
|
||||||
|
@ -20,16 +21,31 @@ namespace Rasterizer {
|
||||||
static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
|
static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
|
||||||
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
|
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
|
||||||
u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
||||||
u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
|
|
||||||
|
|
||||||
// Assuming RGBA8 format until actual framebuffer format handling is implemented
|
// Similarly to textures, the render framebuffer is laid out from bottom to top, too.
|
||||||
|
// NOTE: The framebuffer height register contains the actual FB height minus one.
|
||||||
|
y = (registers.framebuffer.height - y);
|
||||||
|
|
||||||
|
switch (registers.framebuffer.color_format) {
|
||||||
|
case registers.framebuffer.RGBA8:
|
||||||
|
{
|
||||||
|
u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
|
||||||
*(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
|
*(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
|
||||||
|
UNIMPLEMENTED();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static const Math::Vec4<u8> GetPixel(int x, int y) {
|
static const Math::Vec4<u8> GetPixel(int x, int y) {
|
||||||
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
|
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
|
||||||
u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
||||||
|
|
||||||
|
y = (registers.framebuffer.height - y);
|
||||||
|
|
||||||
u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
|
u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
|
||||||
Math::Vec4<u8> ret;
|
Math::Vec4<u8> ret;
|
||||||
ret.a() = value >> 24;
|
ret.a() = value >> 24;
|
||||||
|
@ -43,6 +59,8 @@ static u32 GetDepth(int x, int y) {
|
||||||
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
|
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
|
||||||
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
||||||
|
|
||||||
|
y = (registers.framebuffer.height - y);
|
||||||
|
|
||||||
// Assuming 16-bit depth buffer format until actual format handling is implemented
|
// Assuming 16-bit depth buffer format until actual format handling is implemented
|
||||||
return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
|
return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
|
||||||
}
|
}
|
||||||
|
@ -51,6 +69,8 @@ static void SetDepth(int x, int y, u16 value) {
|
||||||
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
|
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
|
||||||
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
|
||||||
|
|
||||||
|
y = (registers.framebuffer.height - y);
|
||||||
|
|
||||||
// Assuming 16-bit depth buffer format until actual format handling is implemented
|
// Assuming 16-bit depth buffer format until actual format handling is implemented
|
||||||
*(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
|
*(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
|
||||||
}
|
}
|
||||||
|
@ -90,15 +110,22 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
|
||||||
return Math::Cross(vec1, vec2).z;
|
return Math::Cross(vec1, vec2).z;
|
||||||
};
|
};
|
||||||
|
|
||||||
void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
/**
|
||||||
|
* Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
|
||||||
|
* culling via recursion.
|
||||||
|
*/
|
||||||
|
static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
|
||||||
const VertexShader::OutputVertex& v1,
|
const VertexShader::OutputVertex& v1,
|
||||||
const VertexShader::OutputVertex& v2)
|
const VertexShader::OutputVertex& v2,
|
||||||
|
bool reversed = false)
|
||||||
{
|
{
|
||||||
// vertex positions in rasterizer coordinates
|
// vertex positions in rasterizer coordinates
|
||||||
auto FloatToFix = [](float24 flt) {
|
static auto FloatToFix = [](float24 flt) {
|
||||||
return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
|
// TODO: Rounding here is necessary to prevent garbage pixels at
|
||||||
|
// triangle borders. Is that the correct solution, though?
|
||||||
|
return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
|
||||||
};
|
};
|
||||||
auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
|
static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
|
||||||
return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
|
return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -106,14 +133,20 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
ScreenToRasterizerCoordinates(v1.screenpos),
|
ScreenToRasterizerCoordinates(v1.screenpos),
|
||||||
ScreenToRasterizerCoordinates(v2.screenpos) };
|
ScreenToRasterizerCoordinates(v2.screenpos) };
|
||||||
|
|
||||||
if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
|
if (registers.cull_mode == Regs::CullMode::KeepAll) {
|
||||||
|
// Make sure we always end up with a triangle wound counter-clockwise
|
||||||
|
if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
|
||||||
|
ProcessTriangleInternal(v0, v2, v1, true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
|
||||||
// Reverse vertex order and use the CCW code path.
|
// Reverse vertex order and use the CCW code path.
|
||||||
std::swap(vtxpos[1], vtxpos[2]);
|
ProcessTriangleInternal(v0, v2, v1, true);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (registers.cull_mode != Regs::CullMode::KeepAll) {
|
|
||||||
// Cull away triangles which are wound clockwise.
|
// Cull away triangles which are wound clockwise.
|
||||||
// TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
|
|
||||||
if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
|
if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -155,9 +188,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
auto textures = registers.GetTextures();
|
auto textures = registers.GetTextures();
|
||||||
auto tev_stages = registers.GetTevStages();
|
auto tev_stages = registers.GetTevStages();
|
||||||
|
|
||||||
|
// Enter rasterization loop, starting at the center of the topleft bounding box corner.
|
||||||
// TODO: Not sure if looping through x first might be faster
|
// TODO: Not sure if looping through x first might be faster
|
||||||
for (u16 y = min_y; y < max_y; y += 0x10) {
|
for (u16 y = min_y + 8; y < max_y; y += 0x10) {
|
||||||
for (u16 x = min_x; x < max_x; x += 0x10) {
|
for (u16 x = min_x + 8; x < max_x; x += 0x10) {
|
||||||
|
|
||||||
// Calculate the barycentric coordinates w0, w1 and w2
|
// Calculate the barycentric coordinates w0, w1 and w2
|
||||||
int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
|
int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
|
||||||
|
@ -220,7 +254,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
|
||||||
int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
|
int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
|
||||||
int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
|
int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
|
||||||
auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
|
static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case Regs::TextureConfig::ClampToEdge:
|
case Regs::TextureConfig::ClampToEdge:
|
||||||
val = std::max(val, 0);
|
val = std::max(val, 0);
|
||||||
|
@ -228,7 +262,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
return val;
|
return val;
|
||||||
|
|
||||||
case Regs::TextureConfig::Repeat:
|
case Regs::TextureConfig::Repeat:
|
||||||
return (int)(((unsigned)val) % size);
|
return (int)((unsigned)val % size);
|
||||||
|
|
||||||
|
case Regs::TextureConfig::MirroredRepeat:
|
||||||
|
{
|
||||||
|
int val = (int)((unsigned)val % (2 * size));
|
||||||
|
if (val >= size)
|
||||||
|
val = 2 * size - 1 - val;
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
|
LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
|
||||||
|
@ -236,6 +278,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Textures are laid out from bottom to top, hence we invert the t coordinate.
|
||||||
|
// NOTE: This may not be the right place for the inversion.
|
||||||
|
// TODO: Check if this applies to ETC textures, too.
|
||||||
s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
|
s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
|
||||||
t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
|
t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
|
||||||
|
|
||||||
|
@ -262,7 +308,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
|
||||||
auto GetSource = [&](Source source) -> Math::Vec4<u8> {
|
auto GetSource = [&](Source source) -> Math::Vec4<u8> {
|
||||||
switch (source) {
|
switch (source) {
|
||||||
|
// TODO: What's the difference between these two?
|
||||||
case Source::PrimaryColor:
|
case Source::PrimaryColor:
|
||||||
|
case Source::PrimaryFragmentColor:
|
||||||
return primary_color;
|
return primary_color;
|
||||||
|
|
||||||
case Source::Texture0:
|
case Source::Texture0:
|
||||||
|
@ -378,6 +426,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
return result.Cast<u8>();
|
return result.Cast<u8>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case Operation::MultiplyThenAdd:
|
||||||
|
{
|
||||||
|
auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255;
|
||||||
|
result.r() = std::min(255, result.r());
|
||||||
|
result.g() = std::min(255, result.g());
|
||||||
|
result.b() = std::min(255, result.b());
|
||||||
|
return result.Cast<u8>();
|
||||||
|
}
|
||||||
|
|
||||||
|
case Operation::AddThenMultiply:
|
||||||
|
{
|
||||||
|
auto result = input[0] + input[1];
|
||||||
|
result.r() = std::min(255, result.r());
|
||||||
|
result.g() = std::min(255, result.g());
|
||||||
|
result.b() = std::min(255, result.b());
|
||||||
|
result = (result * input[2].Cast<int>()) / 255;
|
||||||
|
return result.Cast<u8>();
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
|
LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
|
||||||
UNIMPLEMENTED();
|
UNIMPLEMENTED();
|
||||||
|
@ -402,6 +469,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
case Operation::Subtract:
|
case Operation::Subtract:
|
||||||
return std::max(0, (int)input[0] - (int)input[1]);
|
return std::max(0, (int)input[0] - (int)input[1]);
|
||||||
|
|
||||||
|
case Operation::MultiplyThenAdd:
|
||||||
|
return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255);
|
||||||
|
|
||||||
|
case Operation::AddThenMultiply:
|
||||||
|
return (std::min(255, (input[0] + input[1])) * input[2]) / 255;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
|
LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
|
||||||
UNIMPLEMENTED();
|
UNIMPLEMENTED();
|
||||||
|
@ -475,7 +548,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
|
||||||
// TODO: Does depth indeed only get written even if depth testing is enabled?
|
// TODO: Does depth indeed only get written even if depth testing is enabled?
|
||||||
if (registers.output_merger.depth_test_enable) {
|
if (registers.output_merger.depth_test_enable) {
|
||||||
u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
|
u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
|
||||||
v1.screenpos[2].ToFloat32() * w1 +
|
v1.screenpos[2].ToFloat32() * w1 +
|
||||||
v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
|
v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
|
||||||
u16 ref_z = GetDepth(x >> 4, y >> 4);
|
u16 ref_z = GetDepth(x >> 4, y >> 4);
|
||||||
|
@ -524,6 +597,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
}
|
}
|
||||||
|
|
||||||
auto dest = GetPixel(x >> 4, y >> 4);
|
auto dest = GetPixel(x >> 4, y >> 4);
|
||||||
|
Math::Vec4<u8> blend_output = combiner_output;
|
||||||
|
|
||||||
if (registers.output_merger.alphablend_enable) {
|
if (registers.output_merger.alphablend_enable) {
|
||||||
auto params = registers.output_merger.alpha_blending;
|
auto params = registers.output_merger.alpha_blending;
|
||||||
|
@ -574,7 +648,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
|
LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
|
||||||
exit(0);
|
UNIMPLEMENTED();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -607,86 +681,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
|
LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
|
||||||
exit(0);
|
UNIMPLEMENTED();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
using BlendEquation = decltype(params)::BlendEquation;
|
||||||
|
// Blends a source color with a destination color according to `equation` and returns
// the result as a Vec4<u8> whose components are each clamped to [0, 255].
// Add/Subtract/ReverseSubtract combine the factor-weighted colors; Min/Max compare the
// raw src/dest components and ignore srcfactor/destfactor.
static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
|
||||||
|
const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
|
||||||
|
BlendEquation equation) {
|
||||||
|
// Kept as int so that intermediate values outside [0, 255] survive until the final clamp.
Math::Vec4<int> result;
|
||||||
|
|
||||||
|
// Color times factor (presumably component-wise - confirm against Math::Vec4's operator*).
// The division by 255 in the cases below brings a product of two u8 values back into u8 range.
auto src_result = (src * srcfactor).Cast<int>();
|
||||||
|
auto dst_result = (dest * destfactor).Cast<int>();
|
||||||
|
|
||||||
|
switch (equation) {
|
||||||
|
case BlendEquation::Add:
|
||||||
|
result = (src_result + dst_result) / 255;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BlendEquation::Subtract:
|
||||||
|
result = (src_result - dst_result) / 255;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BlendEquation::ReverseSubtract:
|
||||||
|
result = (dst_result - src_result) / 255;
|
||||||
|
break;
|
||||||
|
|
||||||
|
// TODO: How do these two actually work?
|
||||||
|
// OpenGL doesn't include the blend factors in the min/max computations,
|
||||||
|
// but is this what the 3DS actually does?
|
||||||
|
case BlendEquation::Min:
|
||||||
|
result.r() = std::min(src.r(), dest.r());
|
||||||
|
result.g() = std::min(src.g(), dest.g());
|
||||||
|
result.b() = std::min(src.b(), dest.b());
|
||||||
|
result.a() = std::min(src.a(), dest.a());
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BlendEquation::Max:
|
||||||
|
result.r() = std::max(src.r(), dest.r());
|
||||||
|
result.g() = std::max(src.g(), dest.g());
|
||||||
|
result.b() = std::max(src.b(), dest.b());
|
||||||
|
result.a() = std::max(src.a(), dest.a());
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
// NOTE(review): the message says "RGB", but this lambda is also invoked with the alpha
// blend equation, so an unknown alpha equation is reported with the same text.
LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
|
||||||
|
// NOTE(review): `result` is never assigned on this path; presumably UNIMPLEMENTED()
// does not return - confirm, otherwise the clamp below reads indeterminate values.
UNIMPLEMENTED();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Saturate every channel to the u8 range before narrowing.
return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
|
||||||
|
MathUtil::Clamp(result.g(), 0, 255),
|
||||||
|
MathUtil::Clamp(result.b(), 0, 255),
|
||||||
|
MathUtil::Clamp(result.a(), 0, 255));
|
||||||
|
};
|
||||||
|
|
||||||
auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
|
auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
|
||||||
LookupFactorA(params.factor_source_a));
|
LookupFactorA(params.factor_source_a));
|
||||||
auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
|
auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
|
||||||
LookupFactorA(params.factor_dest_a));
|
LookupFactorA(params.factor_dest_a));
|
||||||
|
|
||||||
auto src_result = (combiner_output * srcfactor).Cast<int>();
|
blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
|
||||||
auto dst_result = (dest * dstfactor).Cast<int>();
|
blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
|
||||||
|
|
||||||
switch (params.blend_equation_rgb) {
|
|
||||||
case params.Add:
|
|
||||||
{
|
|
||||||
auto result = (src_result + dst_result) / 255;
|
|
||||||
result.r() = std::min(255, result.r());
|
|
||||||
result.g() = std::min(255, result.g());
|
|
||||||
result.b() = std::min(255, result.b());
|
|
||||||
combiner_output = result.Cast<u8>();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case params.Subtract:
|
|
||||||
{
|
|
||||||
auto result = (src_result - dst_result) / 255;
|
|
||||||
result.r() = std::max(0, result.r());
|
|
||||||
result.g() = std::max(0, result.g());
|
|
||||||
result.b() = std::max(0, result.b());
|
|
||||||
combiner_output = result.Cast<u8>();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case params.ReverseSubtract:
|
|
||||||
{
|
|
||||||
auto result = (dst_result - src_result) / 255;
|
|
||||||
result.r() = std::max(0, result.r());
|
|
||||||
result.g() = std::max(0, result.g());
|
|
||||||
result.b() = std::max(0, result.b());
|
|
||||||
combiner_output = result.Cast<u8>();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case params.Min:
|
|
||||||
{
|
|
||||||
// TODO: GL spec says to do it without the factors, but is this what the 3DS does?
|
|
||||||
Math::Vec4<int> result;
|
|
||||||
result.r() = std::min(combiner_output.r(),dest.r());
|
|
||||||
result.g() = std::min(combiner_output.g(),dest.g());
|
|
||||||
result.b() = std::min(combiner_output.b(),dest.b());
|
|
||||||
combiner_output = result.Cast<u8>();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case params.Max:
|
|
||||||
{
|
|
||||||
// TODO: GL spec says to do it without the factors, but is this what the 3DS does?
|
|
||||||
Math::Vec4<int> result;
|
|
||||||
result.r() = std::max(combiner_output.r(),dest.r());
|
|
||||||
result.g() = std::max(combiner_output.g(),dest.g());
|
|
||||||
result.b() = std::max(combiner_output.b(),dest.b());
|
|
||||||
combiner_output = result.Cast<u8>();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
|
LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
|
||||||
exit(0);
|
UNIMPLEMENTED();
|
||||||
}
|
}
|
||||||
|
|
||||||
const Math::Vec4<u8> result = {
|
const Math::Vec4<u8> result = {
|
||||||
registers.output_merger.red_enable ? combiner_output.r() : dest.r(),
|
registers.output_merger.red_enable ? blend_output.r() : dest.r(),
|
||||||
registers.output_merger.green_enable ? combiner_output.g() : dest.g(),
|
registers.output_merger.green_enable ? blend_output.g() : dest.g(),
|
||||||
registers.output_merger.blue_enable ? combiner_output.b() : dest.b(),
|
registers.output_merger.blue_enable ? blend_output.b() : dest.b(),
|
||||||
registers.output_merger.alpha_enable ? combiner_output.a() : dest.a()
|
registers.output_merger.alpha_enable ? blend_output.a() : dest.a()
|
||||||
};
|
};
|
||||||
|
|
||||||
DrawPixel(x >> 4, y >> 4, result);
|
DrawPixel(x >> 4, y >> 4, result);
|
||||||
|
@ -694,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rasterizes one triangle given its three shaded output vertices.
// This wrapper does no work of its own: it forwards v0, v1 and v2, unchanged and in the
// same order, to ProcessTriangleInternal.
void ProcessTriangle(const VertexShader::OutputVertex& v0,
|
||||||
|
const VertexShader::OutputVertex& v1,
|
||||||
|
const VertexShader::OutputVertex& v2) {
|
||||||
|
ProcessTriangleInternal(v0, v1, v2);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Rasterizer
|
} // namespace Rasterizer
|
||||||
|
|
||||||
} // namespace Pica
|
} // namespace Pica
|
||||||
|
|
|
@ -85,8 +85,11 @@ struct VertexShaderState {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CallStackElement {
|
struct CallStackElement {
|
||||||
u32 final_address;
|
u32 final_address; // Address upon which we jump to return_address
|
||||||
u32 return_address;
|
u32 return_address; // Where to jump when leaving scope
|
||||||
|
u8 repeat_counter; // How often to repeat until this call stack element is removed
|
||||||
|
u8 loop_increment; // Which value to add to the loop counter after an iteration
|
||||||
|
// TODO: Should this be a signed value? Does it even matter?
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Is there a maximal size for this?
|
// TODO: Is there a maximal size for this?
|
||||||
|
@ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
if (!state.call_stack.empty()) {
|
if (!state.call_stack.empty()) {
|
||||||
if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
|
auto& top = state.call_stack.top();
|
||||||
state.program_counter = &shader_memory[state.call_stack.top().return_address];
|
if (state.program_counter - shader_memory.data() == top.final_address) {
|
||||||
|
state.address_registers[2] += top.loop_increment;
|
||||||
|
|
||||||
|
if (top.repeat_counter-- == 0) {
|
||||||
|
state.program_counter = &shader_memory[top.return_address];
|
||||||
state.call_stack.pop();
|
state.call_stack.pop();
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Is "trying again" accurate to hardware?
|
// TODO: Is "trying again" accurate to hardware?
|
||||||
continue;
|
continue;
|
||||||
|
@ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
const Instruction& instr = *(const Instruction*)state.program_counter;
|
const Instruction& instr = *(const Instruction*)state.program_counter;
|
||||||
const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
|
const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
|
||||||
|
|
||||||
auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) {
|
static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
|
||||||
|
u32 return_offset, u8 repeat_count, u8 loop_increment) {
|
||||||
state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
|
state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
|
||||||
state.call_stack.push({ offset + num_instructions, return_offset });
|
state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
|
||||||
};
|
};
|
||||||
u32 binary_offset = state.program_counter - shader_memory.data();
|
u32 binary_offset = state.program_counter - shader_memory.data();
|
||||||
|
|
||||||
|
@ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
call(state,
|
call(state,
|
||||||
instr.flow_control.dest_offset,
|
instr.flow_control.dest_offset,
|
||||||
instr.flow_control.num_instructions,
|
instr.flow_control.num_instructions,
|
||||||
binary_offset + 1);
|
binary_offset + 1, 0, 0);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Instruction::OpCode::CALLU:
|
case Instruction::OpCode::CALLU:
|
||||||
|
@ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
call(state,
|
call(state,
|
||||||
instr.flow_control.dest_offset,
|
instr.flow_control.dest_offset,
|
||||||
instr.flow_control.num_instructions,
|
instr.flow_control.num_instructions,
|
||||||
binary_offset + 1);
|
binary_offset + 1, 0, 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
call(state,
|
call(state,
|
||||||
instr.flow_control.dest_offset,
|
instr.flow_control.dest_offset,
|
||||||
instr.flow_control.num_instructions,
|
instr.flow_control.num_instructions,
|
||||||
binary_offset + 1);
|
binary_offset + 1, 0, 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
call(state,
|
call(state,
|
||||||
binary_offset + 1,
|
binary_offset + 1,
|
||||||
instr.flow_control.dest_offset - binary_offset - 1,
|
instr.flow_control.dest_offset - binary_offset - 1,
|
||||||
instr.flow_control.dest_offset + instr.flow_control.num_instructions);
|
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
|
||||||
} else {
|
} else {
|
||||||
call(state,
|
call(state,
|
||||||
instr.flow_control.dest_offset,
|
instr.flow_control.dest_offset,
|
||||||
instr.flow_control.num_instructions,
|
instr.flow_control.num_instructions,
|
||||||
instr.flow_control.dest_offset + instr.flow_control.num_instructions);
|
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) {
|
||||||
call(state,
|
call(state,
|
||||||
binary_offset + 1,
|
binary_offset + 1,
|
||||||
instr.flow_control.dest_offset - binary_offset - 1,
|
instr.flow_control.dest_offset - binary_offset - 1,
|
||||||
instr.flow_control.dest_offset + instr.flow_control.num_instructions);
|
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
|
||||||
} else {
|
} else {
|
||||||
call(state,
|
call(state,
|
||||||
instr.flow_control.dest_offset,
|
instr.flow_control.dest_offset,
|
||||||
instr.flow_control.num_instructions,
|
instr.flow_control.num_instructions,
|
||||||
instr.flow_control.dest_offset + instr.flow_control.num_instructions);
|
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case Instruction::OpCode::LOOP:
|
||||||
|
{
|
||||||
|
state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
|
||||||
|
|
||||||
|
call(state,
|
||||||
|
binary_offset + 1,
|
||||||
|
instr.flow_control.dest_offset - binary_offset + 1,
|
||||||
|
instr.flow_control.dest_offset + 1,
|
||||||
|
shader_uniforms.i[instr.flow_control.int_uniform_id].x,
|
||||||
|
shader_uniforms.i[instr.flow_control.int_uniform_id].z);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
|
LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
|
||||||
(int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
|
(int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
|
||||||
|
|
Loading…
Reference in a new issue