From c52651261916b136f2ea4ff022fb9cead5a73a93 Mon Sep 17 00:00:00 2001 From: Tony Wasserka <NeoBrainX@gmail.com> Date: Sat, 26 Jul 2014 19:17:09 +0200 Subject: [PATCH] Pica: Add vertex shader implementation. --- src/video_core/CMakeLists.txt | 2 + src/video_core/command_processor.cpp | 108 ++++++++- src/video_core/pica.h | 137 ++++++++++- src/video_core/vertex_shader.cpp | 270 ++++++++++++++++++++++ src/video_core/vertex_shader.h | 211 +++++++++++++++++ src/video_core/video_core.vcxproj | 2 + src/video_core/video_core.vcxproj.filters | 2 + 7 files changed, 722 insertions(+), 10 deletions(-) create mode 100644 src/video_core/vertex_shader.cpp create mode 100644 src/video_core/vertex_shader.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 8977c8dcac..74304ee495 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,5 +1,6 @@ set(SRCS command_processor.cpp utils.cpp + vertex_shader.cpp video_core.cpp renderer_opengl/renderer_opengl.cpp) @@ -8,6 +9,7 @@ set(HEADERS command_processor.h utils.h video_core.h renderer_base.h + vertex_shader.h video_core.h renderer_opengl/renderer_opengl.h) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index e909c8c32b..339fa77266 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -2,9 +2,10 @@ // Licensed under GPLv2 // Refer to the license.txt file included. 
-#include "pica.h" #include "command_processor.h" #include "math.h" +#include "pica.h" +#include "vertex_shader.h" namespace Pica { @@ -13,6 +14,14 @@ Regs registers; namespace CommandProcessor { +static int float_regs_counter = 0; + +static u32 uniform_write_buffer[4]; + +// Used for VSLoadProgramData and VSLoadSwizzleData +static u32 vs_binary_write_offset = 0; +static u32 vs_swizzle_write_offset = 0; + static inline void WritePicaReg(u32 id, u32 value) { u32 old_value = registers[id]; registers[id] = value; @@ -67,9 +76,7 @@ static inline void WritePicaReg(u32 id, u32 value) { } // Initialize data for the current vertex - struct { - Math::Vec4<float24> attr[16]; - } input; + VertexShader::InputVertex input; for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { for (int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { @@ -87,7 +94,7 @@ static inline void WritePicaReg(u32 id, u32 value) { input.attr[i][comp].ToFloat32()); } } - // TODO: Run vertex data through vertex shader + VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes()); if (is_indexed) { // TODO: Add processed vertex to vertex cache! 
@@ -98,6 +105,97 @@ static inline void WritePicaReg(u32 id, u32 value) { break; } + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[3], 0x2c4): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[4], 0x2c5): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[5], 0x2c6): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8): + { + auto& uniform_setup = registers.vs_uniform_setup; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that 4 float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. 
+ if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + if (uniform_setup.index > 95) { // reject bad indices before any uniform storage is referenced + ERROR_LOG(GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); + break; + } + + auto& uniform = VertexShader::GetFloatUniform(uniform_setup.index); + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0,1,2,3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRawFloat24(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRawFloat24(((uniform_write_buffer[0] & 0xFF)<<16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRawFloat24(((uniform_write_buffer[1] & 0xFFFF)<<8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF); + } + + DEBUG_LOG(GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! 
+ uniform_setup.index = uniform_setup.index + 1; + } + break; + } + + // Seems to be used to reset the write pointer for VSLoadProgramData + case PICA_REG_INDEX(vs_program.begin_load): + vs_binary_write_offset = 0; + break; + + // Load shader program code + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[2], 0x2ce): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[3], 0x2cf): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[4], 0x2d0): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[5], 0x2d1): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): + { + VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value); + vs_binary_write_offset++; + break; + } + + // Seems to be used to reset the write pointer for VSLoadSwizzleData + case PICA_REG_INDEX(vs_swizzle_patterns.begin_load): + vs_swizzle_write_offset = 0; + break; + + // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[2], 0x2d8): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[3], 0x2d9): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[4], 0x2da): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[5], 0x2db): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): + { + VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value); + vs_swizzle_write_offset++; + break; + } + default: break; } diff --git a/src/video_core/pica.h b/src/video_core/pica.h index faf124c3d2..42303a5850 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -50,7 +50,39 @@ 
struct Regs { INSERT_PADDING_WORDS(0x1); BitField<0, 24, u32> viewport_size_y; - INSERT_PADDING_WORDS(0x1bc); + INSERT_PADDING_WORDS(0xc); + + union { + // Maps components of output vertex attributes to semantics + enum Semantic : u32 + { + POSITION_X = 0, + POSITION_Y = 1, + POSITION_Z = 2, + POSITION_W = 3, + + COLOR_R = 8, + COLOR_G = 9, + COLOR_B = 10, + COLOR_A = 11, + + TEXCOORD0_U = 12, + TEXCOORD0_V = 13, + TEXCOORD1_U = 14, + TEXCOORD1_V = 15, + TEXCOORD2_U = 22, + TEXCOORD2_V = 23, + + INVALID = 31, + }; + + BitField< 0, 5, Semantic> map_x; + BitField< 8, 5, Semantic> map_y; + BitField<16, 5, Semantic> map_z; + BitField<24, 5, Semantic> map_w; + } vs_output_attributes[7]; + + INSERT_PADDING_WORDS(0x1a9); struct { enum class Format : u64 { @@ -133,7 +165,7 @@ struct Regs { // Attribute loaders map the source vertex data to input attributes // This e.g. allows to load different attributes from different memory locations - struct Loader { + struct { // Source attribute data offset from the base address u32 data_offset; @@ -189,7 +221,90 @@ struct Regs { u32 trigger_draw; u32 trigger_draw_indexed; - INSERT_PADDING_WORDS(0xd0); + INSERT_PADDING_WORDS(0x8a); + + // Offset to shader program entry point (in words) + BitField<0, 16, u32> vs_main_offset; + + union { + BitField< 0, 4, u64> attribute0_register; + BitField< 4, 4, u64> attribute1_register; + BitField< 8, 4, u64> attribute2_register; + BitField<12, 4, u64> attribute3_register; + BitField<16, 4, u64> attribute4_register; + BitField<20, 4, u64> attribute5_register; + BitField<24, 4, u64> attribute6_register; + BitField<28, 4, u64> attribute7_register; + BitField<32, 4, u64> attribute8_register; + BitField<36, 4, u64> attribute9_register; + BitField<40, 4, u64> attribute10_register; + BitField<44, 4, u64> attribute11_register; + BitField<48, 4, u64> attribute12_register; + BitField<52, 4, u64> attribute13_register; + BitField<56, 4, u64> attribute14_register; + BitField<60, 4, u64> attribute15_register; + 
+ int GetRegisterForAttribute(int attribute_index) { + u64 fields[] = { + attribute0_register, attribute1_register, attribute2_register, attribute3_register, + attribute4_register, attribute5_register, attribute6_register, attribute7_register, + attribute8_register, attribute9_register, attribute10_register, attribute11_register, + attribute12_register, attribute13_register, attribute14_register, attribute15_register, + }; + return (int)fields[attribute_index]; + } + } vs_input_register_map; + + INSERT_PADDING_WORDS(0x3); + + struct { + enum Format : u32 + { + FLOAT24 = 0, + FLOAT32 = 1 + }; + + bool IsFloat32() const { + return format == FLOAT32; + } + + union { + // Index of the next uniform to write to + // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices + BitField<0, 7, u32> index; + + BitField<31, 1, Format> format; + }; + + // Writing to these registers sets the "current" uniform. + // TODO: It's not clear how the hardware stores what the "current" uniform is. + u32 set_value[8]; + + } vs_uniform_setup; + + INSERT_PADDING_WORDS(0x2); + + struct { + u32 begin_load; + + // Writing to these registers sets the "current" word in the shader program. + // TODO: It's not clear how the hardware stores what the "current" word is. + u32 set_word[8]; + } vs_program; + + INSERT_PADDING_WORDS(0x1); + + // This register group is used to load an internal table of swizzling patterns, + // which are indexed by each shader instruction to specify vector component swizzling. + struct { + u32 begin_load; + + // Writing to these registers sets the "current" swizzle pattern in the table. + // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is. 
+ u32 set_word[8]; + } vs_swizzle_patterns; + + INSERT_PADDING_WORDS(0x22); #undef INSERT_PADDING_WORDS_HELPER1 #undef INSERT_PADDING_WORDS_HELPER2 @@ -219,6 +334,11 @@ struct Regs { ADD_FIELD(num_vertices); ADD_FIELD(trigger_draw); ADD_FIELD(trigger_draw_indexed); + ADD_FIELD(vs_main_offset); + ADD_FIELD(vs_input_register_map); + ADD_FIELD(vs_uniform_setup); + ADD_FIELD(vs_program); + ADD_FIELD(vs_swizzle_patterns); #undef ADD_FIELD #endif // _MSC_VER @@ -259,17 +379,25 @@ private: ASSERT_REG_POSITION(viewport_size_x, 0x41); ASSERT_REG_POSITION(viewport_size_y, 0x43); +ASSERT_REG_POSITION(vs_output_attributes[0], 0x50); +ASSERT_REG_POSITION(vs_output_attributes[1], 0x51); ASSERT_REG_POSITION(vertex_attributes, 0x200); ASSERT_REG_POSITION(index_array, 0x227); ASSERT_REG_POSITION(num_vertices, 0x228); ASSERT_REG_POSITION(trigger_draw, 0x22e); ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); +ASSERT_REG_POSITION(vs_main_offset, 0x2ba); +ASSERT_REG_POSITION(vs_input_register_map, 0x2bb); +ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0); +ASSERT_REG_POSITION(vs_program, 0x2cb); +ASSERT_REG_POSITION(vs_swizzle_patterns, 0x2d5); #undef ASSERT_REG_POSITION #endif // !defined(_MSC_VER) // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway. 
-static_assert(sizeof(Regs) == 0x300 * sizeof(u32), "Invalid total size of register set"); +static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be"); +static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be"); extern Regs registers; // TODO: Not sure if we want to have one global instance for this @@ -347,7 +475,6 @@ private: float value; }; - union CommandHeader { CommandHeader(u32 h) : hex(h) {} diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp new file mode 100644 index 0000000000..93830a96af --- /dev/null +++ b/src/video_core/vertex_shader.cpp @@ -0,0 +1,270 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#include "pica.h" +#include "vertex_shader.h" +#include <core/mem_map.h> +#include <common/file_util.h> + +namespace Pica { + +namespace VertexShader { + +static struct { + Math::Vec4<float24> f[96]; +} shader_uniforms; + + +// TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to! +// For now, we just keep these local arrays around. +static u32 shader_memory[1024]; +static u32 swizzle_data[1024]; + +void SubmitShaderMemoryChange(u32 addr, u32 value) +{ + if (addr < sizeof(shader_memory) / sizeof(shader_memory[0])) shader_memory[addr] = value; // drop writes past the end of the buffer +} + +void SubmitSwizzleDataChange(u32 addr, u32 value) +{ + if (addr < sizeof(swizzle_data) / sizeof(swizzle_data[0])) swizzle_data[addr] = value; // drop writes past the end of the buffer +} + +Math::Vec4<float24>& GetFloatUniform(u32 index) +{ + return shader_uniforms.f[index]; +} + +struct VertexShaderState { + u32* program_counter; + + const float24* input_register_table[16]; + float24* output_register_table[7*4]; + + Math::Vec4<float24> temporary_registers[16]; + bool status_registers[2]; + + enum { + INVALID_ADDRESS = 0xFFFFFFFF + }; + u32 call_stack[8]; // TODO: What is the maximal call stack depth? 
+ u32* call_stack_pointer; +}; + +static void ProcessShaderCode(VertexShaderState& state) { + while (true) { + bool increment_pc = true; + bool exit_loop = false; + const Instruction& instr = *(const Instruction*)state.program_counter; + + const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1] + : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x + : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x + : nullptr; + const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2] + : &state.temporary_registers[instr.common.src2-0x10].x; + // TODO: Unsure about the limit values + float24* dest = (instr.common.dest <= 0x1C) ? state.output_register_table[instr.common.dest] + : (instr.common.dest <= 0x3C) ? nullptr + : (instr.common.dest <= 0x7C) ? &state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4] + : nullptr; + + const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; + + const float24 src1[4] = { + src1_[(int)swizzle.GetSelectorSrc1(0)], + src1_[(int)swizzle.GetSelectorSrc1(1)], + src1_[(int)swizzle.GetSelectorSrc1(2)], + src1_[(int)swizzle.GetSelectorSrc1(3)], + }; + const float24 src2[4] = { + src2_[(int)swizzle.GetSelectorSrc2(0)], + src2_[(int)swizzle.GetSelectorSrc2(1)], + src2_[(int)swizzle.GetSelectorSrc2(2)], + src2_[(int)swizzle.GetSelectorSrc2(3)], + }; + + switch (instr.opcode) { + case Instruction::OpCode::ADD: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i] + src2[i]; + } + + break; + } + + case Instruction::OpCode::MUL: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i] * src2[i]; + } + + break; + } + + case Instruction::OpCode::DP3: + case Instruction::OpCode::DP4: + { + float24 dot = float24::FromFloat32(0.f); + int num_components = 
(instr.opcode == Instruction::OpCode::DP3) ? 3 : 4; + for (int i = 0; i < num_components; ++i) + dot = dot + src1[i] * src2[i]; + + for (int i = 0; i < num_components; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = dot; + } + break; + } + + // Reciprocal + case Instruction::OpCode::RCP: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + // TODO: Be stable against division by zero! + // TODO: I think this might be wrong... we should only use one component here + dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32()); + } + + break; + } + + // Reciprocal Square Root + case Instruction::OpCode::RSQ: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + // TODO: Be stable against division by zero! + // TODO: I think this might be wrong... we should only use one component here + dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32())); + } + + break; + } + + case Instruction::OpCode::MOV: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i]; + } + break; + } + + case Instruction::OpCode::RET: + if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) { + exit_loop = true; + } else { + state.program_counter = &shader_memory[*state.call_stack_pointer]; // resume at the calling instruction + *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS; // clear the popped entry (not the caller's), then pop + } + + break; + + case Instruction::OpCode::CALL: + increment_pc = false; + + _dbg_assert_(GPU, state.call_stack_pointer + 1 < state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0])); // bound is in entries, not bytes + + *++state.call_stack_pointer = state.program_counter - shader_memory; + // TODO: Does this offset refer to the beginning of shader memory? + state.program_counter = &shader_memory[instr.flow_control.offset_words]; + break; + + case Instruction::OpCode::FLS: + // TODO: Do whatever needs to be done here? 
+ break; + + default: + ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", + (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex); + break; + } + + if (increment_pc) + ++state.program_counter; + + if (exit_loop) + break; + } +} + +OutputVertex RunShader(const InputVertex& input, int num_attributes) +{ + VertexShaderState state; + + const u32* main = &shader_memory[registers.vs_main_offset]; + state.program_counter = (u32*)main; + + // Setup input register table + const auto& attribute_register_map = registers.vs_input_register_map; + float24 dummy_register; + std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register); + if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; + if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; + if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; + if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; + if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; + if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; + if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; + if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; + if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; + if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; + if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; + if(num_attributes > 11) 
state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; + if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; + if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; + if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; + if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; + + // Setup output register table + OutputVertex ret; + for (int i = 0; i < 7; ++i) { + const auto& output_register_map = registers.vs_output_attributes[i]; + + u32 semantics[4] = { + output_register_map.map_x, output_register_map.map_y, + output_register_map.map_z, output_register_map.map_w + }; + + for (int comp = 0; comp < 4; ++comp) + state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp]; + } + + state.status_registers[0] = false; + state.status_registers[1] = false; + std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]), + VertexShaderState::INVALID_ADDRESS); + state.call_stack_pointer = &state.call_stack[0]; + + ProcessShaderCode(state); + + DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", + ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), + ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), + ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); + + return ret; +} + + +} // namespace + +} // namespace diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h new file mode 100644 index 0000000000..1b71e367b3 --- /dev/null +++ b/src/video_core/vertex_shader.h @@ -0,0 +1,211 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the 
license.txt file included. + +#pragma once + +#include <initializer_list> + +#include <common/common_types.h> + +#include "math.h" +#include "pica.h" + +namespace Pica { + +namespace VertexShader { + +struct InputVertex { + Math::Vec4<float24> attr[16]; +}; + +struct OutputVertex { + OutputVertex() = default; + + // VS output attributes + Math::Vec4<float24> pos; + Math::Vec4<float24> dummy; // quaternions (not implemented, yet) + Math::Vec4<float24> color; + Math::Vec2<float24> tc0; + float24 tc0_v; + + // Padding for optimal alignment + float24 pad[14]; + + // Attributes used to store intermediate results + + // position after perspective divide + Math::Vec3<float24> screenpos; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const OutputVertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... + tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { + OutputVertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; +static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); + +union Instruction { + enum class OpCode : u32 { + ADD = 0x0, + DP3 = 0x1, + DP4 = 0x2, + + MUL = 0x8, + + MAX = 0xC, + MIN = 0xD, + RCP = 0xE, + RSQ = 0xF, + + MOV = 0x13, + + RET = 0x21, + FLS = 0x22, // Flush + CALL = 0x24, + }; + + std::string GetOpCodeName() const { + std::map<OpCode, std::string> map = { + { OpCode::ADD, "ADD" }, + { OpCode::DP3, "DP3" }, + { OpCode::DP4, "DP4" }, + { OpCode::MUL, "MUL" }, + { OpCode::MAX, "MAX" }, + { OpCode::MIN, "MIN" }, + { OpCode::RCP, "RCP" }, + { OpCode::RSQ, "RSQ" }, + { 
OpCode::MOV, "MOV" }, + { OpCode::RET, "RET" }, + { OpCode::FLS, "FLS" }, + }; + auto it = map.find(opcode); + if (it == map.end()) + return "UNK"; + else + return it->second; + } + + u32 hex; + + BitField<0x1a, 0x6, OpCode> opcode; + + // General notes: + // + // When two input registers are used, one of them uses a 5-bit index while the other + // one uses a 7-bit index. This is because at most one floating point uniform may be used + // as an input. + + + // Format used e.g. by arithmetic instructions and comparisons + // "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats), + // while "dest" addresses individual floats. + union { + BitField<0x00, 0x5, u32> operand_desc_id; + BitField<0x07, 0x5, u32> src2; + BitField<0x0c, 0x7, u32> src1; + BitField<0x13, 0x7, u32> dest; + } common; + + // Format used for flow control instructions ("if") + union { + BitField<0x00, 0x8, u32> num_instructions; + BitField<0x0a, 0xc, u32> offset_words; + } flow_control; +}; + +union SwizzlePattern { + u32 hex; + + enum class Selector : u32 { + x = 0, + y = 1, + z = 2, + w = 3 + }; + + Selector GetSelectorSrc1(int comp) const { + Selector selectors[] = { + src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3 + }; + return selectors[comp]; + } + + Selector GetSelectorSrc2(int comp) const { + Selector selectors[] = { + src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3 + }; + return selectors[comp]; + } + + bool DestComponentEnabled(int i) const { + return (dest_mask & (0x8 >> i)); + } + + std::string SelectorToString(bool src2) const { + std::map<Selector, std::string> map = { + { Selector::x, "x" }, + { Selector::y, "y" }, + { Selector::z, "z" }, + { Selector::w, "w" } + }; + std::string ret; + for (int i = 0; i < 4; ++i) { + ret += map.at(src2 ? 
GetSelectorSrc2(i) : GetSelectorSrc1(i)); + } + return ret; + } + + std::string DestMaskToString() const { + std::string ret; + for (int i = 0; i < 4; ++i) { + if (!DestComponentEnabled(i)) + ret += "_"; + else + ret += "xyzw"[i]; + } + return ret; + } + + // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x + BitField< 0, 4, u32> dest_mask; + + BitField< 5, 2, Selector> src1_selector_3; + BitField< 7, 2, Selector> src1_selector_2; + BitField< 9, 2, Selector> src1_selector_1; + BitField<11, 2, Selector> src1_selector_0; + + BitField<14, 2, Selector> src2_selector_3; + BitField<16, 2, Selector> src2_selector_2; + BitField<18, 2, Selector> src2_selector_1; + BitField<20, 2, Selector> src2_selector_0; + + BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign? +}; + +void SubmitShaderMemoryChange(u32 addr, u32 value); +void SubmitSwizzleDataChange(u32 addr, u32 value); + +OutputVertex RunShader(const InputVertex& input, int num_attributes); + +Math::Vec4<float24>& GetFloatUniform(u32 index); + +} // namespace + +} // namespace + diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj index 28eb212848..56729dc03e 100644 --- a/src/video_core/video_core.vcxproj +++ b/src/video_core/video_core.vcxproj @@ -22,6 +22,7 @@ <ClCompile Include="renderer_opengl\renderer_opengl.cpp" /> <ClCompile Include="command_processor.cpp" /> <ClCompile Include="utils.cpp" /> + <ClCompile Include="vertex_shader.cpp" /> <ClCompile Include="video_core.cpp" /> </ItemGroup> <ItemGroup> @@ -31,6 +32,7 @@ <ClInclude Include="pica.h" /> <ClInclude Include="renderer_base.h" /> <ClInclude Include="utils.h" /> + <ClInclude Include="vertex_shader.h" /> <ClInclude Include="video_core.h" /> <ClInclude Include="renderer_opengl\renderer_opengl.h" /> </ItemGroup> diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters index 713458fcff..db0b37018f 100644 --- 
a/src/video_core/video_core.vcxproj.filters +++ b/src/video_core/video_core.vcxproj.filters @@ -11,6 +11,7 @@ </ClCompile> <ClCompile Include="command_processor.cpp" /> <ClCompile Include="utils.cpp" /> + <ClCompile Include="vertex_shader.cpp" /> <ClCompile Include="video_core.cpp" /> </ItemGroup> <ItemGroup> @@ -23,6 +24,7 @@ <ClInclude Include="pica.h" /> <ClInclude Include="renderer_base.h" /> <ClInclude Include="utils.h" /> + <ClInclude Include="vertex_shader.h" /> <ClInclude Include="video_core.h" /> </ItemGroup> <ItemGroup>