r300: nir fcsel/CMP lowering pass for R500

Right now this is done in the backend so move it up to NIR. Doing this in the backend is easier, as at that time we can have a better idea about when we hit the hardware limits of three different TMP sources, however moving this to NIR allows for some optimizations. Specifically, at this time if we decide we actually have to lower we still have the info if we have plain fcsel for which we can save the comparison and emit flrp only. During translation to TGSI all of fcsel, fcsel_gt, and fcsel_ge translate to CMP so at that point the comparison is always needed. Shader-db RV530: total instructions in shared programs: 126057 -> 125823 (-0.19%) instructions in affected programs: 11359 -> 11125 (-2.06%) helped: 68 HURT: 12 total temps in shared programs: 17043 -> 17023 (-0.12%) temps in affected programs: 459 -> 439 (-4.36%) helped: 32 HURT: 12 total cycles in shared programs: 191604 -> 191294 (-0.16%) cycles in affected programs: 11834 -> 11524 (-2.62%) helped: 68 HURT: 12 The hurt shaders are some GTK shaders where there is some bad interaction with nir_move_vec_src_uses_to_dest. This is known and might be improved later by thweking the pass more. Reviewed-by: Filip Gawin <filip.gawin@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26816>
2023-09-21 11:59:07 +02:00
parent 75e7790ee5
commit 024491f60f
4 changed files with 140 additions and 0 deletions
--- a/src/gallium/drivers/r300/compiler/nir_to_rc.c
+++ b/src/gallium/drivers/r300/compiler/nir_to_rc.c
@@ -27,6 +27,7 @@
 #include "compiler/nir/nir_worklist.h"
 #include "nir_to_rc.h"
 #include "r300_nir.h"
+#include "r300_screen.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_dump.h"
@@ -2372,6 +2373,7 @@ const void *nir_to_rc_options(struct nir_shader *s,
 {
   struct ntr_compile *c;
   const void *tgsi_tokens;
+   bool is_r500 = r300_screen(screen)->caps.is_r500;
   nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen);

   /* Lower array indexing on FS inputs.  Since we don't set
@@ -2446,6 +2448,10 @@ const void *nir_to_rc_options(struct nir_shader *s,
              !options->lower_cmp && !options->lower_fabs);
   /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
   NIR_PASS_V(s, nir_copy_prop);
+   if (s->info.stage == MESA_SHADER_VERTEX) {
+      if (is_r500)
+         NIR_PASS_V(s, r300_nir_lower_fcsel_r500);
+   }
   NIR_PASS_V(s, nir_opt_dce);

   nir_move_options move_all =
--- a/src/gallium/drivers/r300/compiler/r300_nir.h
+++ b/src/gallium/drivers/r300/compiler/r300_nir.h
@@ -81,4 +81,6 @@ extern bool r300_nir_clean_double_fneg(struct nir_shader *shader);

 extern bool r300_nir_post_integer_lowering(struct nir_shader *shader);

+extern bool r300_nir_lower_fcsel_r500(nir_shader *shader);
+
 #endif /* R300_NIR_H */
--- a/src/gallium/drivers/r300/compiler/r500_nir_lower_fcsel.c
+++ b/src/gallium/drivers/r300/compiler/r500_nir_lower_fcsel.c
@@ -0,0 +1,131 @@
+#include <stdbool.h>
+#include "r300_nir.h"
+#include "nir_builder.h"
+
+static int
+follow_modifiers(nir_instr *instr)
+{
+   /* We don't have texturing so the only other options besides alus are
+    * just load input, load ubo or phi. We can copy propagate the first two
+    * in most cases. The cases when the copy propagate is not guaranteed
+    * to work is with indirect ubo load and in the presence of control flow.
+    * So just be safe and count this as a separate tmp.
+    */
+   if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      /* It should be enough to check if any of the uses is in phi. */
+      if (intrin->intrinsic == nir_intrinsic_load_ubo_vec4 ||
+          intrin->intrinsic == nir_intrinsic_load_constant ||
+          intrin->intrinsic == nir_intrinsic_load_input) {
+          nir_foreach_use(use, &intrin->def) {
+              if (nir_src_parent_instr(use)->type == nir_instr_type_phi)
+                 return intrin->def.index;
+          }
+      }
+      if (intrin->intrinsic == nir_intrinsic_load_ubo_vec4 &&
+          !nir_src_is_const(intrin->src[1]))
+      return intrin->def.index;
+   }
+   /* Assume the worst when we see a phi. */
+   if (instr->type == nir_instr_type_phi)
+      return nir_instr_as_phi(instr)->def.index;
+
+   if (instr->type != nir_instr_type_alu)
+      return -1;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   if (alu->op == nir_op_fneg || alu->op == nir_op_fabs) {
+      return follow_modifiers(alu->src[0].src.ssa->parent_instr);
+   }
+   return alu->def.index;
+}
+
+static bool
+has_three_different_tmp_sources(nir_alu_instr *fcsel)
+{
+   unsigned src_def_index[3];
+   for (unsigned i = 0; i < 3; i++) {
+      int index = follow_modifiers(fcsel->src[i].src.ssa->parent_instr);
+      if (index == -1)
+         return false;
+      else
+	 src_def_index[i] = index;
+   }
+   return src_def_index[0] != src_def_index[1] &&
+          src_def_index[0] != src_def_index[2] &&
+          src_def_index[1] != src_def_index[2];
+}
+
+static bool
+is_comparison(nir_instr *instr)
+{
+   if (instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   switch (alu->op) {
+   case nir_op_sge:
+   case nir_op_slt:
+   case nir_op_seq:
+   case nir_op_sne:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static bool
+r300_nir_lower_fcsel_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   if (alu->op != nir_op_fcsel && alu->op != nir_op_fcsel_ge && alu->op != nir_op_fcsel_gt)
+      return false;
+
+   if (has_three_different_tmp_sources(alu)) {
+      nir_def *lrp;
+      b->cursor = nir_before_instr(&alu->instr);
+      /* Lower to LRP.
+       * At this point there are no fcsels as all bcsels were converted to
+       * fcsel_gt by nir_lower_bool_to_float, however we can save on the slt
+       * even for nir_op_fcsel_gt if the source is 0 or 1 anyway.
+       */
+      nir_instr *src0_instr = alu->src[0].src.ssa->parent_instr;
+      if (alu->op == nir_op_fcsel ||
+          (alu->op == nir_op_fcsel_gt && is_comparison(src0_instr))) {
+         lrp = nir_flrp(b, nir_ssa_for_alu_src(b, alu, 2),
+                        nir_ssa_for_alu_src(b, alu, 1),
+                        nir_ssa_for_alu_src(b, alu, 0));
+      } else if (alu->op == nir_op_fcsel_ge) {
+         nir_def *sge = nir_sge(b, nir_ssa_for_alu_src(b, alu, 0), nir_imm_float(b, 0.0));
+         lrp = nir_flrp(b, nir_ssa_for_alu_src(b, alu, 2),
+                        nir_ssa_for_alu_src(b, alu, 1), sge);
+      } else {
+         nir_def *slt = nir_slt(b, nir_fneg(b, nir_ssa_for_alu_src(b, alu, 0)),
+                                nir_imm_float(b, 0.0));
+         lrp = nir_flrp(b, nir_ssa_for_alu_src(b, alu, 2),
+                        nir_ssa_for_alu_src(b, alu, 1), slt);
+      }
+
+      nir_def_rewrite_uses(&alu->def, lrp);
+      nir_instr_remove(&alu->instr);
+      return true;
+   }
+   return false;
+}
+
+bool
+r300_nir_lower_fcsel_r500(nir_shader *shader)
+{
+   bool progress = nir_shader_instructions_pass(shader,
+                                                r300_nir_lower_fcsel_instr,
+                                                nir_metadata_block_index |
+                                                   nir_metadata_dominance,
+                                                NULL);
+   return progress;
+}
--- a/src/gallium/drivers/r300/meson.build
+++ b/src/gallium/drivers/r300/meson.build
@@ -77,6 +77,7 @@ files_r300 = files(
  'compiler/r500_fragprog.h',
  'compiler/r300_nir.c',
  'compiler/r300_nir.h',
+  'compiler/r500_nir_lower_fcsel.c',
  'compiler/radeon_code.c',
  'compiler/radeon_code.h',
  'compiler/radeon_compiler.c',