From d151d797b1c281d5813ca705722f43b4be20ca6d Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 18:20:33 -0200 Subject: [PATCH 1/7] Vertex Shader: Zero OutputVertex to avoid denormals Unused OutputVertex attributes were being left un-initialized. The leftover garbage sometimes decoded as floating-point denormalized values, causing fallbacks to microcode and massive slowdowns in the rest of the rasterization pipeline even though the results were unused. By zeroing the structure we ensure these attributes only contain harmless zeros. --- src/video_core/vertex_shader.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp index e31bc3bc73..bed5081a0e 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/vertex_shader.cpp @@ -469,6 +469,10 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) // Setup output register table OutputVertex ret; + // Zero output so that attributes which aren't output won't have denormals in them, which will + // slow us down later. + memset(&ret, 0, sizeof(ret)); + for (int i = 0; i < 7; ++i) { const auto& output_register_map = registers.vs_output_attributes[i]; From a320d1a5b4b7ce3b90372697fbe50242b78d082e Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 00:56:32 -0200 Subject: [PATCH 2/7] Clipper: Avoid dynamic allocations The triangle clipper was allocating its temporary input, output and work buffers using a std::vector. Since this is a hot path, it's desirable to use stack allocation instead. 
--- externals/boost | 2 +- src/video_core/clipper.cpp | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/externals/boost b/externals/boost index b060148c08..97052c28ac 160000 --- a/externals/boost +++ b/externals/boost @@ -1 +1 @@ -Subproject commit b060148c08ae87a3a5809c4f48cb26ac667487ab +Subproject commit 97052c28acb141dbf3c5e14114af99045344b695 diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 0bcd0b8950..e89b7a0c0e 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include +#include #include "clipper.h" #include "pica.h" @@ -98,18 +98,15 @@ static void InitScreenCoordinates(OutputVertex& vtx) } void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { + using boost::container::static_vector; // TODO (neobrain): // The list of output vertices has some fixed maximum size, // however I haven't taken the time to figure out what it is exactly. - // For now, we hence just assume a maximal size of 1000 vertices. - const size_t max_vertices = 1000; - std::vector buffer_vertices; - std::vector output_list{ &v0, &v1, &v2 }; - - // Make sure to reserve space for all vertices. - // Without this, buffer reallocation would invalidate references. - buffer_vertices.reserve(max_vertices); + // For now, we hence just assume a maximal size of 256 vertices. + static const size_t MAX_VERTICES = 256; + static_vector buffer_vertices; + static_vector output_list = { &v0, &v1, &v2 }; // Simple implementation of the Sutherland-Hodgman clipping algorithm. 
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) @@ -120,7 +117,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { - const std::vector input_list = output_list; + const static_vector input_list = output_list; output_list.clear(); const OutputVertex* reference_vertex = input_list.back(); From da049764377804f055ff1898ba0e58c8ee096805 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 02:46:29 -0200 Subject: [PATCH 3/7] CMake: Require Boost 1.57.0 (fixes Travis OS X) --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb87d50dd..884520cef0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,11 +41,11 @@ else() message(STATUS "libpng not found. Some debugging features have been disabled.") endif() -find_package(Boost) +find_package(Boost 1.57.0) if (Boost_FOUND) include_directories(${Boost_INCLUDE_DIRS}) else() - message(STATUS "Boost not found, falling back to externals") + message(STATUS "Boost 1.57.0 or newer not found, falling back to externals") include_directories(externals/boost) endif() From 7e9bc85cc826c55a5aa612a3c2f14b8fb631a68c Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 23:05:15 -0200 Subject: [PATCH 4/7] Clipper: Compact buffers on each clipping pass Use a new buffer management scheme in the clipper that allows using a bounded minimal amount of buffer space. Even though it copies more data, it is still slightly faster, likely due to using less cache. 
--- src/video_core/clipper.cpp | 55 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index e89b7a0c0e..0521ef8661 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -100,13 +100,15 @@ static void InitScreenCoordinates(OutputVertex& vtx) void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { using boost::container::static_vector; - // TODO (neobrain): - // The list of output vertices has some fixed maximum size, - // however I haven't taken the time to figure out what it is exactly. - // For now, we hence just assume a maximal size of 256 vertices. - static const size_t MAX_VERTICES = 256; - static_vector buffer_vertices; - static_vector output_list = { &v0, &v1, &v2 }; + // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at + // the new edge (or less in degenerate cases). As such, we can say that each clipping plane + // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a + // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. + static const size_t MAX_VERTICES = 9; + static_vector buffer_a = { v0, v1, v2 }; + static_vector buffer_b; + auto* output_list = &buffer_a; + auto* input_list = &buffer_b; // Simple implementation of the Sutherland-Hodgman clipping algorithm. 
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) @@ -117,48 +119,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { - const static_vector input_list = output_list; - output_list.clear(); + std::swap(input_list, output_list); + output_list->clear(); - const OutputVertex* reference_vertex = input_list.back(); + const OutputVertex* reference_vertex = &input_list->back(); - for (const auto& vertex : input_list) { + for (const auto& vertex : *input_list) { // NOTE: This algorithm changes vertex order in some cases! - if (edge.IsInside(*vertex)) { + if (edge.IsInside(vertex)) { if (edge.IsOutSide(*reference_vertex)) { - buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); - output_list.push_back(&(buffer_vertices.back())); + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); } - output_list.push_back(vertex); + output_list->push_back(vertex); } else if (edge.IsInside(*reference_vertex)) { - buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); - output_list.push_back(&(buffer_vertices.back())); + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); } - - reference_vertex = vertex; + reference_vertex = &vertex; } // Need to have at least a full triangle to continue... 
- if (output_list.size() < 3) + if (output_list->size() < 3) return; } - InitScreenCoordinates(*(output_list[0])); - InitScreenCoordinates(*(output_list[1])); + InitScreenCoordinates((*output_list)[0]); + InitScreenCoordinates((*output_list)[1]); - for (size_t i = 0; i < output_list.size() - 2; i ++) { - OutputVertex& vtx0 = *(output_list[0]); - OutputVertex& vtx1 = *(output_list[i+1]); - OutputVertex& vtx2 = *(output_list[i+2]); + for (size_t i = 0; i < output_list->size() - 2; i ++) { + OutputVertex& vtx0 = (*output_list)[0]; + OutputVertex& vtx1 = (*output_list)[i+1]; + OutputVertex& vtx2 = (*output_list)[i+2]; InitScreenCoordinates(vtx2); LOG_TRACE(Render_Software, - "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), " + "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), " "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and " "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)", - i,output_list.size(), buffer_vertices.size(), + i, output_list->size(), vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(), From 2012e1420f90ea86ea6975f2005f05ecd304b0c4 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Tue, 23 Dec 2014 10:59:07 -0200 Subject: [PATCH 5/7] Rasterizer: Common sub-expression elimination Move the computation of some values out of loops so that they're not constantly recalculated even when they don't change. 
--- src/video_core/rasterizer.cpp | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index df1f88c793..63da7104d6 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -106,6 +106,14 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; + const Math::Vec3 w_inverse = Math::MakeVec( + float24::FromFloat32(1.0f) / v0.pos.w, + float24::FromFloat32(1.0f) / v1.pos.w, + float24::FromFloat32(1.0f) / v2.pos.w); + + auto textures = registers.GetTextures(); + auto tev_stages = registers.GetTevStages(); + // TODO: Not sure if looping through x first might be faster for (u16 y = min_y; y < max_y; y += 0x10) { for (u16 x = min_x; x < max_x; x += 0x10) { @@ -129,6 +137,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, if (w0 < 0 || w1 < 0 || w2 < 0) continue; + auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast(w0)), + float24::FromFloat32(static_cast(w1)), + float24::FromFloat32(static_cast(w2))); + float24 interpolated_w_inverse = float24::FromFloat32(1.0f) / Math::Dot(w_inverse, baricentric_coordinates); + // Perspective correct attribute interpolation: // Attribute values cannot be calculated by simple linear interpolation since // they are not linear in screen space. For example, when interpolating a @@ -145,19 +158,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, // // The generalization to three vertices is straightforward in baricentric coordinates. 
auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { - auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w, - attr1 / v1.pos.w, - attr2 / v2.pos.w); - auto w_inverse = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w, - float24::FromFloat32(1.f) / v1.pos.w, - float24::FromFloat32(1.f) / v2.pos.w); - auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast(w0)), - float24::FromFloat32(static_cast(w1)), - float24::FromFloat32(static_cast(w2))); - + auto attr_over_w = Math::MakeVec(attr0, attr1, attr2) * w_inverse; float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates); - float24 interpolated_w_inverse = Math::Dot(w_inverse, baricentric_coordinates); - return interpolated_attr_over_w / interpolated_w_inverse; + return interpolated_attr_over_w * interpolated_w_inverse; }; Math::Vec4 primary_color{ @@ -177,7 +180,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, Math::Vec4 texture_color[3]{}; for (int i = 0; i < 3; ++i) { - auto texture = registers.GetTextures()[i]; + const auto& texture = textures[i]; if (!texture.enabled) continue; @@ -219,7 +222,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, // with some basic arithmetic. Alpha combiners can be configured separately but work // analogously. Math::Vec4 combiner_output; - for (auto tev_stage : registers.GetTevStages()) { + for (const auto& tev_stage : tev_stages) { using Source = Regs::TevStageConfig::Source; using ColorModifier = Regs::TevStageConfig::ColorModifier; using AlphaModifier = Regs::TevStageConfig::AlphaModifier; From fe186d3a598837ba7337f06399dfb8ae7930a070 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Tue, 23 Dec 2014 12:27:56 -0200 Subject: [PATCH 6/7] GPU: Bitwise texture swizzling Replace the loop-based texture address swizzling code by a bit-twiddling implementation, providing a very small speed up. Also simplify addressing code. 
--- src/video_core/debug_utils/debug_utils.cpp | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 328386b7e4..5921185a65 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -304,7 +304,6 @@ std::unique_ptr FinishPicaTracing() } const Math::Vec4 LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) { - // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each // of which is composed of four 2x2 subtiles each of which is composed of four texels. // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g. @@ -323,41 +322,39 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture // 02 03 06 07 18 19 22 23 // 00 01 04 05 16 17 20 21 - // TODO(neobrain): Not sure if this swizzling pattern is used for all textures. - // To be flexible in case different but similar patterns are used, we keep this - // somewhat inefficient code around for now. - int texel_index_within_tile = 0; - for (int block_size_index = 0; block_size_index < 3; ++block_size_index) { - int sub_tile_width = 1 << block_size_index; - int sub_tile_height = 1 << block_size_index; + const unsigned int block_width = 8; + const unsigned int block_height = 8; - int sub_tile_index = (x & sub_tile_width) << block_size_index; - sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index); - texel_index_within_tile += sub_tile_index; - } + const unsigned int coarse_x = x & ~7; + const unsigned int coarse_y = y & ~7; - const int block_width = 8; - const int block_height = 8; + // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are + // arranged in a Z-order curve. 
More details on the bit manipulation at: + // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/ + unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210 + i = (i ^ (i << 2)) & 0x1313; // ---2 --10 + i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0 + i = (i | (i >> 7)) & 0x3F; - int coarse_x = (x / block_width) * block_width; - int coarse_y = (y / block_height) * block_height; + source += coarse_y * info.stride; + const unsigned int offset = coarse_x * block_height + i; switch (info.format) { case Regs::TextureFormat::RGBA8: { - const u8* source_ptr = source + coarse_x * block_height * 4 + coarse_y * info.stride + texel_index_within_tile * 4; + const u8* source_ptr = source + offset * 4; return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] }; } case Regs::TextureFormat::RGB8: { - const u8* source_ptr = source + coarse_x * block_height * 3 + coarse_y * info.stride + texel_index_within_tile * 3; + const u8* source_ptr = source + offset * 3; return { source_ptr[2], source_ptr[1], source_ptr[0], 255 }; } case Regs::TextureFormat::RGBA5551: { - const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2); + const u16 source_ptr = *(const u16*)(source + offset * 2); u8 r = (source_ptr >> 11) & 0x1F; u8 g = ((source_ptr) >> 6) & 0x1F; u8 b = (source_ptr >> 1) & 0x1F; @@ -367,7 +364,7 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::RGB565: { - const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2); + const u16 source_ptr = *(const u16*)(source + offset * 2); u8 r = (source_ptr >> 11) & 0x1F; u8 g = ((source_ptr) >> 5) & 0x3F; u8 b = (source_ptr) & 0x1F; @@ -376,7 +373,7 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::RGBA4: { - const u8* source_ptr = source + coarse_x * 
block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2; + const u8* source_ptr = source + offset * 2; u8 r = source_ptr[1] >> 4; u8 g = source_ptr[1] & 0xFF; u8 b = source_ptr[0] >> 4; @@ -390,7 +387,7 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::IA8: { - const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2; + const u8* source_ptr = source + offset * 2; // TODO: component order not verified @@ -404,13 +401,13 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::I8: { - const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile; + const u8* source_ptr = source + offset; return { *source_ptr, *source_ptr, *source_ptr, 255 }; } case Regs::TextureFormat::A8: { - const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile; + const u8* source_ptr = source + offset; if (disable_alpha) { return { *source_ptr, *source_ptr, *source_ptr, 255 }; @@ -421,7 +418,7 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::IA4: { - const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2; + const u8* source_ptr = source + offset / 2; // TODO: component order not verified @@ -440,7 +437,7 @@ const Math::Vec4 LookupTexture(const u8* source, int x, int y, const Texture case Regs::TextureFormat::A4: { - const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2; + const u8* source_ptr = source + offset / 2; // TODO: component order not verified From 8369ee58035ca98f776428f6cccbcf987fee3bc9 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Tue, 23 Dec 2014 13:05:51 -0200 Subject: [PATCH 7/7] Rasterizer: Pre-divide vertex attributes by W 
Execute the division-by-W for perspective-correct interpolation of values in the clipper, moving it out of the rasterization inner loop. --- src/video_core/clipper.cpp | 13 ++++++++++--- src/video_core/pica.h | 20 ++++++++++++++++++++ src/video_core/rasterizer.cpp | 7 ++----- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 0521ef8661..1744066ba0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -91,10 +91,17 @@ static void InitScreenCoordinates(OutputVertex& vtx) viewport.zscale = float24::FromRawFloat24(registers.viewport_depth_range); viewport.offset_z = float24::FromRawFloat24(registers.viewport_depth_far_plane); + float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w; + vtx.color *= inv_w; + vtx.tc0 *= inv_w; + vtx.tc1 *= inv_w; + vtx.tc2 *= inv_w; + vtx.pos.w = inv_w; + // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not - vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; - vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; - vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale; + vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; + vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; + vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; } void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 89d97e4e93..38bac748cd 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -757,6 +757,26 @@ struct float24 { return float24::FromFloat32(ToFloat32() - flt.ToFloat32()); } + float24& operator *= (const float24& flt) { + value *= 
flt.ToFloat32(); + return *this; + } + + float24& operator /= (const float24& flt) { + value /= flt.ToFloat32(); + return *this; + } + + float24& operator += (const float24& flt) { + value += flt.ToFloat32(); + return *this; + } + + float24& operator -= (const float24& flt) { + value -= flt.ToFloat32(); + return *this; + } + float24 operator - () const { return float24::FromFloat32(-ToFloat32()); } diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 63da7104d6..a801488726 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -106,10 +106,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; - const Math::Vec3 w_inverse = Math::MakeVec( - float24::FromFloat32(1.0f) / v0.pos.w, - float24::FromFloat32(1.0f) / v1.pos.w, - float24::FromFloat32(1.0f) / v2.pos.w); + auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); auto textures = registers.GetTextures(); auto tev_stages = registers.GetTevStages(); @@ -158,7 +155,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, // // The generalization to three vertices is straightforward in baricentric coordinates. auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { - auto attr_over_w = Math::MakeVec(attr0, attr1, attr2) * w_inverse; + auto attr_over_w = Math::MakeVec(attr0, attr1, attr2); float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates); return interpolated_attr_over_w * interpolated_w_inverse; };