ac/nir/cull: cull small prims using a point-triangle intersection test

This is based on Timur Kristof's code, but there are a lot of differences. The idea is that it doesn't just compute an intersection between a point and a triangle. It computes the *distance* between a point and a triangle and it does so in screen space. It accurately takes the subpixel precision of the rasterizer into account, so that it works optimally at all resolutions, all MSAA modes, and all quant modes. The distance computation is only approximated because it only considers the infinite lines going through triangle edges. However, it seems to be more than sufficient in practice because the existing rounding-based small prim culling compensates for it. The performance improvement is up to 10% in some geometry-bound tests, though targeted microbenchmarks can show a lot more than that. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33361>
2025-02-03 01:58:33 -05:00
parent 36bf26cf27
commit ce716d009f
6 changed files with 183 additions and 13 deletions
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -164,6 +164,8 @@ typedef struct {
    * culling. Set this when the shader writes the viewport index.
    */
   bool skip_viewport_culling;
+   /* Use the point-triangle intersection to cull small triangles. */
+   bool use_point_tri_intersection;

   /* VS */
   unsigned num_vertices_per_primitive;
--- a/src/amd/common/nir/ac_nir_cull.c
+++ b/src/amd/common/nir/ac_nir_cull.c
@@ -88,32 +88,194 @@ cull_frustrum(nir_builder *b, nir_def *bbox_min[2], nir_def *bbox_max[2])
 }

 static nir_def *
-cull_small_primitive_triangle(nir_builder *b, nir_def *bbox_min[2], nir_def *bbox_max[2])
+cross(nir_builder *b, nir_def *p[2], nir_def *q[2])
+{
+   nir_def *left = nir_fmul(b, p[0], q[1]);
+   nir_def *right = nir_fmul(b, q[0], p[1]);
+   return nir_fsub(b, left, right);
+}
+
+/* Return whether the distance between the point and the triangle is greater than the given
+ * distance.
+ */
+static nir_def *
+point_outside_triangle(nir_builder *b, nir_def *p[2], nir_def *pos[3][2], nir_def *distance)
+{
+   nir_def **vtx_a = pos[0], **vtx_b = pos[1], **vtx_c = pos[2];
+   nir_def *a_b[2] = { nir_fsub(b, vtx_b[0], vtx_a[0]), nir_fsub(b, vtx_b[1], vtx_a[1]) };
+   nir_def *a_c[2] = { nir_fsub(b, vtx_c[0], vtx_a[0]), nir_fsub(b, vtx_c[1], vtx_a[1]) };
+   nir_def *b_c[2] = { nir_fsub(b, vtx_c[0], vtx_b[0]), nir_fsub(b, vtx_c[1], vtx_b[1]) };
+   nir_def *a_p[2] = { nir_fsub(b, p[0], vtx_a[0]), nir_fsub(b, p[1], vtx_a[1]) };
+   nir_def *b_p[2] = { nir_fsub(b, p[0], vtx_b[0]), nir_fsub(b, p[1], vtx_b[1]) };
+
+   /* Compute 2D cross products, which we need for computing distances from lines. */
+   nir_def *crosses[3] = { cross(b, a_p, a_c), cross(b, a_b, a_p), cross(b, b_c, b_p) };
+
+   /* These are distances from the 3 infinite lines going through triangle edges.
+    *
+    * A distance is positive if the point is on one side of the line, and negative
+    * if the point is on the other side of the line. That's because the distance is
+    * a normalized 2D cross product, which is always scalar and signed.
+    */
+   nir_def *line_distances[3] = {
+      nir_fmul(b, crosses[0], nir_frsq(b, nir_fdot2(b, nir_vec(b, a_c, 2), nir_vec(b, a_c, 2)))),
+      nir_fmul(b, crosses[1], nir_frsq(b, nir_fdot2(b, nir_vec(b, a_b, 2), nir_vec(b, a_b, 2)))),
+      nir_fmul(b, crosses[2], nir_frsq(b, nir_fdot2(b, nir_vec(b, b_c, 2), nir_vec(b, b_c, 2)))),
+   };
+
+   nir_def *max_distance =
+      nir_fmax(b, line_distances[0], nir_fmax(b, line_distances[1], line_distances[2]));
+   nir_def *min_distance =
+      nir_fmin(b, line_distances[0], nir_fmin(b, line_distances[1], line_distances[2]));
+
+   /* If max_distance > distance && min_distance < -distance, the point is outside the triangle.
+    *
+    * Explanation:
+    *
+    * If the point is outside the triangle, 2 distances are positive and 1 is negative, or 2 distances
+    * are negative and 1 is positive (depending on winding and where the point is). max_distance > distance
+    * will pass because at least 1 distance is positive, and min_distance < -distance will pass because at
+    * least 1 distance is negative.
+    *
+    * However, if the point is inside the triangle, either all distances are positive (min_distance < -distance
+    * will fail) or all distances are negative (max_distance > distance will fail), depending on winding.
+    *
+    * Note that min/max_distance are not distances from the triangle, but they are distances from
+    * the lines. This can falsely return that the distance between the point and the triangle is
+    * less than the given distance if 2 infinite lines are sticking out of 1 vertex, are
+    * pointing in the direction of the point, and there is a very small angle between them.
+    * Most of these cases should be eliminated by the rounding-based small prim culling.
+    */
+   return nir_iand(b, nir_flt(b, distance, max_distance),
+                   nir_flt(b, min_distance, nir_fneg(b, distance)));
+}
+
+static nir_def *
+cull_small_primitive_triangle(nir_builder *b, bool use_point_tri_intersection,
+                              nir_def *bbox_min[2], nir_def *bbox_max[2], nir_def *pos[3][4])
 {
   nir_def *vp = nir_load_cull_triangle_viewport_xy_scale_and_offset_amd(b);
   nir_def *small_prim_precision = nir_load_cull_small_triangle_precision_amd(b);
   nir_def *rejected = nir_imm_false(b);

+   nir_def *bbox_pixel_min[2], *bbox_pixel_max[2], *vp_scale[2], *vp_translate[2];
+
   for (unsigned chan = 0; chan < 2; ++chan) {
-      nir_def *vp_scale = nir_channel(b, vp, chan);
-      nir_def *vp_translate = nir_channel(b, vp, 2 + chan);
+      vp_scale[chan] = nir_channel(b, vp, chan);
+      vp_translate[chan] = nir_channel(b, vp, 2 + chan);

      /* Convert the position to screen-space coordinates. */
-      nir_def *min = nir_ffma(b, bbox_min[chan], vp_scale, vp_translate);
-      nir_def *max = nir_ffma(b, bbox_max[chan], vp_scale, vp_translate);
+      nir_def *min = nir_ffma(b, bbox_min[chan], vp_scale[chan], vp_translate[chan]);
+      nir_def *max = nir_ffma(b, bbox_max[chan], vp_scale[chan], vp_translate[chan]);

      /* Scale the bounding box according to precision. */
      min = nir_fsub(b, min, small_prim_precision);
      max = nir_fadd(b, max, small_prim_precision);

      /* Determine if the bbox intersects the sample point, by checking if the min and max round to the same int. */
-      min = nir_fround_even(b, min);
-      max = nir_fround_even(b, max);
+      bbox_pixel_min[chan] = nir_fround_even(b, min);
+      bbox_pixel_max[chan] = nir_fround_even(b, max);

-      nir_def *rounded_to_eq = nir_feq(b, min, max);
+      nir_def *rounded_to_eq = nir_feq(b, bbox_pixel_min[chan], bbox_pixel_max[chan]);
      rejected = nir_ior(b, rejected, rounded_to_eq);
   }

+   /* If the triangle hasn't been filtered out yet, try another way.
+    * The point-triangle test below only runs for triangles that the rounding-based test above
+    * did not reject, and only when the caller enabled use_point_tri_intersection.
+    */
+   if (use_point_tri_intersection) {
+      nir_def *outside_center = NULL;
+      nir_if *if_passed = nir_push_if(b, nir_inot(b, rejected));
+      {
+         /* Calculate rounded bounding box dimensions. */
+         nir_def *bbox_pixel_w = nir_fsub(b, bbox_pixel_max[0], bbox_pixel_min[0]);
+         nir_def *bbox_pixel_h = nir_fsub(b, bbox_pixel_max[1], bbox_pixel_min[1]);
+
+         /* The largest bounding box (rounded to integer coordinates) that contains the triangle
+          * that we accept has 1x1 pixel area and looks like this:
+          *
+          *    X         X         X
+          *
+          *         ┌─────────┐
+          *         │         │
+          *    X    │    X    │    X
+          *         │         │
+          *         └─────────┘
+          *
+          *    X         X         X
+          *
+          * However, the largest bounding box before the rounding that contains the triangle can be
+          * this:
+          *
+          *    X         X         X
+          *     ┌─────────────────┐
+          *     │                 │
+          *     │                 │
+          *    X│        X        │X
+          *     │                 │
+          *     │                 │
+          *     └─────────────────┘
+          *    X         X         X
+          *
+          * which is the largest area that has 1 pixel center in the middle and 8 pixel centers
+          * outside. Therefore, a 1x1 pixels-large rounded bounding box represents an area that's
+          * slightly smaller than 2x2 pixels and has only a single pixel in the center. Thanks to
+          * that and given that the triangle is always inside the bounding box, we only have to
+          * compute a single point-triangle intersection.
+          *
+          * Check if the triangle's rounded bounding box is a single pixel, which means the triangle
+          * can only potentially affect this pixel.
+          *
+          * 1.01 is used to prevent possible FP precision issues.
+          */
+         nir_def *w_1px = nir_flt_imm(b, bbox_pixel_w, 1.01);
+         nir_def *h_1px = nir_flt_imm(b, bbox_pixel_h, 1.01);
+         nir_def *fals = nir_imm_false(b);
+         nir_if *if_tri_1px = nir_push_if(b, nir_iand(b, w_1px, h_1px));
+         {
+            /* The coordinates of the pixel center in screen space. */
+            nir_def *pix_center[] = {
+               nir_fadd_imm(b, bbox_pixel_min[0], 0.5),
+               nir_fadd_imm(b, bbox_pixel_min[1], 0.5),
+            };
+
+            /* These are the X, Y coordinates of the 3 points of the triangle. */
+            nir_def *screen_pos[3][2] = {{0}};
+
+            /* Transform the coordinates to screen space. */
+            for (unsigned vtx = 0; vtx < 3; ++vtx) {
+               for (unsigned chan = 0; chan < 2; ++chan)
+                  screen_pos[vtx][chan] = nir_ffma(b, pos[vtx][chan], vp_scale[chan], vp_translate[chan]);
+            }
+
+            /* small_prim_precision is the rasterization precision in X and Y axes, meaning it's the size of
+             * one cell in the fixed-point grid that vertex positions are snapped to. When floating-point
+             * coordinates are snapped (rounded) to fixed-point, vertex positions can be shifted by
+             * +-small_prim_precision.
+             *
+             * We need a precision value that works in all directions. Compute the worst-case
+             * omnidirectional precision, which is the length of the hypotenuse where
+             * small_prim_precision is the length of the catheti.
+             *
+             * x = small_prim_precision
+             * sqrt(x*x + x*x) = sqrt(x*x*2) = x * sqrt(2)
+             */
+            nir_def *precision_distance = nir_fmul_imm(b, small_prim_precision, sqrt(2));
+
+            /* Check if the pixel center is outside the triangle. If it is, the triangle can be
+             * safely removed.
+             */
+            outside_center = point_outside_triangle(b, pix_center, screen_pos, precision_distance);
+         }
+         nir_pop_if(b, if_tri_1px);
+
+         outside_center = nir_if_phi(b, outside_center, fals);
+      }
+      nir_pop_if(b, if_passed);
+      rejected = nir_if_phi(b, outside_center, rejected);
+   }
+
   return rejected;
 }

@@ -135,6 +297,7 @@ call_accept_func(nir_builder *b, nir_def *accepted, ac_nir_cull_accepted accept_
 static nir_def *
 ac_nir_cull_triangle(nir_builder *b,
                     bool skip_viewport_culling,
+                     bool use_point_tri_intersection,
                     nir_def *initially_accepted,
                     nir_def *pos[3][4],
                     position_w_info *w_info,
@@ -162,7 +325,8 @@ ac_nir_cull_triangle(nir_builder *b,

      nir_if *if_cull_small_prims = nir_push_if(b, nir_load_cull_small_triangles_enabled_amd(b));
      {
-         nir_def *small_prim_rejected = cull_small_primitive_triangle(b, bbox_min, bbox_max);
+         nir_def *small_prim_rejected = cull_small_primitive_triangle(b, use_point_tri_intersection,
+                                                                      bbox_min, bbox_max, pos);
         bbox_rejected = nir_ior(b, bbox_rejected, small_prim_rejected);
      }
      nir_pop_if(b, if_cull_small_prims);
@@ -349,6 +513,7 @@ ac_nir_cull_line(nir_builder *b,
 nir_def *
 ac_nir_cull_primitive(nir_builder *b,
                      bool skip_viewport_culling,
+                      bool use_point_tri_intersection,
                      nir_def *initially_accepted,
                      nir_def *pos[3][4],
                      unsigned num_vertices,
@@ -359,8 +524,8 @@ ac_nir_cull_primitive(nir_builder *b,
   analyze_position_w(b, pos, num_vertices, &w_info);

   if (num_vertices == 3) {
-      return ac_nir_cull_triangle(b, skip_viewport_culling, initially_accepted, pos, &w_info,
-                                  accept_func, state);
+      return ac_nir_cull_triangle(b, skip_viewport_culling, use_point_tri_intersection,
+                                  initially_accepted, pos, &w_info, accept_func, state);
   } else if (num_vertices == 2) {
      return ac_nir_cull_line(b, skip_viewport_culling, initially_accepted, pos, &w_info,
                              accept_func, state);
--- a/src/amd/common/nir/ac_nir_helpers.h
+++ b/src/amd/common/nir/ac_nir_helpers.h
@@ -147,6 +147,7 @@ ac_nir_map_io_location(unsigned location,
 nir_def *
 ac_nir_cull_primitive(nir_builder *b,
                      bool skip_viewport_culling,
+                      bool use_point_tri_intersection,
                      nir_def *initially_accepted,
                      nir_def *pos[3][4],
                      unsigned num_vertices,
--- a/src/amd/common/nir/ac_nir_lower_ngg.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg.c
@@ -1193,8 +1193,8 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
         }

         /* See if the current primitive is accepted */
-         ac_nir_cull_primitive(b, s->options->skip_viewport_culling, accepted_by_clipdist, pos,
-                               s->options->num_vertices_per_primitive,
+         ac_nir_cull_primitive(b, s->options->skip_viewport_culling, s->options->use_point_tri_intersection,
+                               accepted_by_clipdist, pos, s->options->num_vertices_per_primitive,
                               cull_primitive_accepted, s);
      }
      nir_pop_if(b, if_gs_thread);
--- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
@@ -632,6 +632,7 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
      nir_def *accepted_by_clipdist = nir_imm_true(b);

      nir_def *accepted = ac_nir_cull_primitive(b, s->options->skip_viewport_culling,
+                                                s->options->use_point_tri_intersection,
                                                accepted_by_clipdist, pos,
                                                s->num_vertices_per_primitive, NULL, NULL);

--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1807,6 +1807,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
      .force_vrs = sel->screen->options.vrs2x2,
      .use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
      .skip_viewport_culling = sel->info.writes_viewport_index,
+      .use_point_tri_intersection = sel->screen->info.num_cu / sel->screen->info.num_se >= 12,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||