GPU: Use half width vector types where appropriate

2024-07-11 18:37:51 +10:00
parent d70f5ddb32
commit 59a2309a83
8 changed files with 2614 additions and 220 deletions
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@ -319,15 +319,15 @@ protected:
  virtual void DrawRendererStats();
  virtual void OnBufferSwapped();

-  ALWAYS_INLINE_RELEASE void AddDrawTriangleTicks(GSVector4i v1, GSVector4i v2, GSVector4i v3, bool shaded,
+  ALWAYS_INLINE_RELEASE void AddDrawTriangleTicks(GSVector2i v1, GSVector2i v2, GSVector2i v3, bool shaded,
                                                  bool textured, bool semitransparent)
  {
    // This will not produce the correct results for triangles which are partially outside the clip area.
    // However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect
    // the edges with the clip rectangle.
    // TODO: Coordinates are exclusive, so off by one here...
-    const GSVector4i clamp_min = m_clamped_drawing_area; // would be xyxy(), but zw isn't used.
-    const GSVector4i clamp_max = m_clamped_drawing_area.zwzw();
+    const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x);
+    const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z);
    v1 = v1.sat_i32(clamp_min, clamp_max);
    v2 = v2.sat_i32(clamp_min, clamp_max);
    v3 = v3.sat_i32(clamp_min, clamp_max);
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@ -1962,15 +1962,15 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
 {
  DebugAssert(num_vertices == 3 || num_vertices == 4);

-  GSVector4i v0 = GSVector4i::load32(&vertices[0].u);
-  GSVector4i v1 = GSVector4i::load32(&vertices[1].u);
-  GSVector4i v2 = GSVector4i::load32(&vertices[2].u);
-  GSVector4i v3;
-  GSVector4i min = v0.min_u16(v1).min_u16(v2);
-  GSVector4i max = v0.max_u16(v1).max_u16(v2);
+  GSVector2i v0 = GSVector2i::load32(&vertices[0].u);
+  GSVector2i v1 = GSVector2i::load32(&vertices[1].u);
+  GSVector2i v2 = GSVector2i::load32(&vertices[2].u);
+  GSVector2i v3;
+  GSVector2i min = v0.min_u16(v1).min_u16(v2);
+  GSVector2i max = v0.max_u16(v1).max_u16(v2);
  if (num_vertices == 4)
  {
-    v3 = GSVector4i::load32(&vertices[3].u);
+    v3 = GSVector2i::load32(&vertices[3].u);
    min = min.min_u16(v3);
    max = max.max_u16(v3);
  }
@ -1986,7 +1986,7 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
    vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);

  if (m_texpage_dirty != 0)
-    CheckForTexPageOverlap(min.upl32(max).u16to32());
+    CheckForTexPageOverlap(GSVector4i(min).upl32(GSVector4i(max)).u16to32());
 }

 void GPU_HW::SetBatchDepthBuffer(bool enabled)
@ -2157,8 +2157,6 @@ void GPU_HW::LoadVertices()
      const bool shaded = rc.shading_enable;
      const bool pgxp = g_settings.gpu_pgxp_enable;

-      // TODO: Using 64-bit vectors instead of 32-bit could be advantageous here, particularly for small ARM cores and
-      // RISC-V.
      const u32 first_color = rc.color_for_first_vertex;
      u32 num_vertices = rc.quad_polygon ? 4 : 3;
      std::array<BatchVertex, 4> vertices;
@ -2240,13 +2238,13 @@ void GPU_HW::LoadVertices()
      }

      // Cull polygons which are too large.
-      const GSVector4 v0f = GSVector4::loadl(&vertices[0].x);
-      const GSVector4 v1f = GSVector4::loadl(&vertices[1].x);
-      const GSVector4 v2f = GSVector4::loadl(&vertices[2].x);
-      const GSVector4 min_pos_12 = v1f.min(v2f);
-      const GSVector4 max_pos_12 = v1f.max(v2f);
-      const GSVector4i draw_rect_012 =
-        GSVector4i(min_pos_12.min(v0f).upld(max_pos_12.max(v0f))).add32(GSVector4i::cxpr(0, 0, 1, 1));
+      const GSVector2 v0f = GSVector2::load(&vertices[0].x);
+      const GSVector2 v1f = GSVector2::load(&vertices[1].x);
+      const GSVector2 v2f = GSVector2::load(&vertices[2].x);
+      const GSVector2 min_pos_12 = v1f.min(v2f);
+      const GSVector2 max_pos_12 = v1f.max(v2f);
+      const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
+                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
      const GSVector4i clamped_draw_rect_012 = draw_rect_012.rintersect(m_clamped_drawing_area);
      const bool first_tri_culled = (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH ||
                                     draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT || clamped_draw_rect_012.rempty());
@ -2265,9 +2263,8 @@ void GPU_HW::LoadVertices()
          ComputePolygonUVLimits(vertices.data(), num_vertices);

        AddDrawnRectangle(clamped_draw_rect_012);
-        AddDrawTriangleTicks(GSVector4i(native_vertex_positions[0]), GSVector4i(native_vertex_positions[1]),
-                             GSVector4i(native_vertex_positions[2]), rc.shading_enable, rc.texture_enable,
-                             rc.transparency_enable);
+        AddDrawTriangleTicks(native_vertex_positions[0], native_vertex_positions[1], native_vertex_positions[2],
+                             rc.shading_enable, rc.texture_enable, rc.transparency_enable);

        // Expand lines to triangles (Doom, Soul Blade, etc.)
        if (!rc.quad_polygon && m_line_detect_mode >= GPULineDetectMode::BasicTriangles && !is_3d &&
@ -2288,9 +2285,9 @@ void GPU_HW::LoadVertices()
      // quads
      if (rc.quad_polygon)
      {
-        const GSVector4 v3f = GSVector4::loadl(&vertices[3].x);
-        const GSVector4i draw_rect_123 =
-          GSVector4i(min_pos_12.min(v3f).upld(max_pos_12.max(v3f))).add32(GSVector4i::cxpr(0, 0, 1, 1));
+        const GSVector2 v3f = GSVector2::load(&vertices[3].x);
+        const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
+                                           .add32(GSVector4i::cxpr(0, 0, 1, 1));
        const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);

        // Cull polygons which are too large.
@ -2312,9 +2309,8 @@ void GPU_HW::LoadVertices()
            ComputePolygonUVLimits(vertices.data(), num_vertices);

          AddDrawnRectangle(clamped_draw_rect_123);
-          AddDrawTriangleTicks(GSVector4i(native_vertex_positions[2]), GSVector4i(native_vertex_positions[1]),
-                               GSVector4i(native_vertex_positions[3]), rc.shading_enable, rc.texture_enable,
-                               rc.transparency_enable);
+          AddDrawTriangleTicks(native_vertex_positions[2], native_vertex_positions[1], native_vertex_positions[3],
+                               rc.shading_enable, rc.texture_enable, rc.transparency_enable);

          const u32 start_index = m_batch_vertex_count;
          DebugAssert(m_batch_index_space >= 3);
@ -2650,7 +2646,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)

  const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
  const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
-  const GSVector4i page_offset = GSVector4i(m_current_texture_page_offset).xyxy();
+  const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();

  uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift));   // shift only goes on the x
  uv_rect = uv_rect.add32(page_offset);                  // page offset
@ -3220,7 +3216,7 @@ void GPU_HW::DispatchRenderCommand()
      }

      const GSVector4i page_rect = m_draw_mode.mode_reg.GetTexturePageRectangle();
-      m_current_texture_page_offset = page_rect.xy();
+      GSVector4i::storel(m_current_texture_page_offset, page_rect);

      u8 new_texpage_dirty = m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0;
      new_texpage_dirty |= m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0;
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@ -294,7 +294,7 @@ private:
  GSVector4i m_vram_dirty_draw_rect = INVALID_RECT;
  GSVector4i m_vram_dirty_write_rect = INVALID_RECT;
  GSVector4i m_current_uv_rect = INVALID_RECT;
-  GSVector2i m_current_texture_page_offset = {};
+  s32 m_current_texture_page_offset[2] = {};

  std::unique_ptr<GPUPipeline> m_wireframe_pipeline;

--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@ -515,7 +515,7 @@ void GPU_SW::DispatchRenderCommand()
      GPUBackendDrawPolygonCommand* cmd = m_backend.NewDrawPolygonCommand(num_vertices);
      FillDrawCommand(cmd, rc);

-      std::array<GSVector4i, 4> positions;
+      std::array<GSVector2i, 4> positions;
      const u32 first_color = rc.color_for_first_vertex;
      const bool shaded = rc.shading_enable;
      const bool textured = rc.texture_enable;
@ -528,14 +528,15 @@ void GPU_SW::DispatchRenderCommand()
        vert->x = m_drawing_offset.x + vp.x;
        vert->y = m_drawing_offset.y + vp.y;
        vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
-        positions[i] = GSVector4i::loadl(&vert->x);
+        positions[i] = GSVector2i::load(&vert->x);
      }

      // Cull polygons which are too large.
-      const GSVector4i min_pos_12 = positions[1].min_i32(positions[2]);
-      const GSVector4i max_pos_12 = positions[1].max_i32(positions[2]);
-      const GSVector4i draw_rect_012 =
-        min_pos_12.min_i32(positions[0]).upl64(max_pos_12.max_i32(positions[0])).add32(GSVector4i::cxpr(0, 0, 1, 1));
+      const GSVector2i min_pos_12 = positions[1].min_i32(positions[2]);
+      const GSVector2i max_pos_12 = positions[1].max_i32(positions[2]);
+      const GSVector4i draw_rect_012 = GSVector4i(min_pos_12.min_i32(positions[0]))
+                                         .upl64(GSVector4i(max_pos_12.max_i32(positions[0])))
+                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
      const bool first_tri_culled =
        (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT ||
         !m_clamped_drawing_area.rintersects(draw_rect_012));
@ -556,8 +557,9 @@ void GPU_SW::DispatchRenderCommand()
      // quads
      if (rc.quad_polygon)
      {
-        const GSVector4i draw_rect_123 =
-          min_pos_12.min_i32(positions[3]).upl64(max_pos_12.max_i32(positions[3])).add32(GSVector4i::cxpr(0, 0, 1, 1));
+        const GSVector4i draw_rect_123 = GSVector4i(min_pos_12.min_i32(positions[3]))
+                                           .upl64(GSVector4i(max_pos_12.max_i32(positions[3])))
+                                           .add32(GSVector4i::cxpr(0, 0, 1, 1));

        // Cull polygons which are too large.
        const bool second_tri_culled =