radeonsi: use shader_info::use_aco_amd to determine whether to use ACO

The flag is set by si_nir_scan_shader, so it must only be read after that function has run.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
This commit is contained in:
Marek Olšák
2024-04-14 14:16:32 -04:00
committed by Marge Bot
parent c83225cd0a
commit fe7a4ed708
8 changed files with 47 additions and 34 deletions

View File

@@ -100,12 +100,12 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
assert(thread_index < ARRAY_SIZE(sscreen->compiler));
compiler = &sscreen->compiler[thread_index];
if (!sscreen->use_aco && !*compiler)
*compiler = si_create_llvm_compiler(sscreen);
assert(program->ir_type == PIPE_SHADER_IR_NIR);
si_nir_scan_shader(sscreen, sel->nir, &sel->info);
if (!sel->info.base.use_aco_amd && !*compiler)
*compiler = si_create_llvm_compiler(sscreen);
si_get_active_slot_masks(sscreen, &sel->info, &sel->active_const_and_shader_buffers,
&sel->active_samplers_and_images);

View File

@@ -695,7 +695,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
break;
case nir_intrinsic_load_tess_rel_patch_id_amd:
/* LLVM need to replace patch id arg, so have to be done in LLVM backend. */
if (!sel->screen->use_aco)
if (!sel->info.base.use_aco_amd)
return false;
if (stage == MESA_SHADER_TESS_CTRL) {
@@ -776,7 +776,7 @@ static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state *
*/
/* LLVM keep non-uniform sampler as index, so can't do this in NIR. */
if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->screen->use_aco) {
if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->info.base.use_aco_amd) {
int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
assert(samp_index >= 0 && comp_index >= 0);

View File

@@ -382,7 +382,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
}
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
/* VGPRs */
@@ -400,7 +400,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
/* VGPRs */
@@ -453,7 +453,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
ac_add_return(&args->ac, AC_ARG_VGPR);
/* VS outputs passed via VGPRs to TCS. */
if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
for (i = 0; i < num_outputs * 4; i++)
ac_add_return(&args->ac, AC_ARG_VGPR);
@@ -461,7 +461,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
}
} else {
/* TCS inputs are passed via VGPRs from VS. */
if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs);
for (i = 0; i < num_inputs * 4; i++)
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
@@ -574,7 +574,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
}
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
/* VGPRs */
@@ -588,7 +588,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
/* VGPRs */
@@ -641,7 +641,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
SI_PARAM_POS_FIXED_PT);
if (sel->screen->use_aco) {
if (sel->info.base.use_aco_amd) {
ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
@@ -728,7 +728,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
/* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
/* Hardware VGPRs. */
@@ -2345,7 +2345,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
/* LLVM does not work well with this, so is handled in llvm backend waterfall. */
if (sel->screen->use_aco && sel->info.has_non_uniform_tex_access) {
if (sel->info.base.use_aco_amd && sel->info.has_non_uniform_tex_access) {
nir_lower_non_uniform_access_options options = {
.types = nir_lower_non_uniform_texture_access,
};
@@ -2449,7 +2449,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
ac_nir_lower_ps_options options = {
.gfx_level = sel->screen->info.gfx_level,
.family = sel->screen->info.family,
.use_aco = sel->screen->use_aco,
.use_aco = sel->info.base.use_aco_amd,
.uses_discard = si_shader_uses_discard(shader),
.alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
.dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
@@ -2538,7 +2538,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
/* aco only accept scalar const, must be done after si_nir_late_opts()
* which may generate vec const.
*/
if (sel->screen->use_aco)
if (sel->info.base.use_aco_amd)
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
/* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
@@ -2643,7 +2643,7 @@ si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
si_nir_opts(gs_selector->screen, nir, false);
/* aco only accept scalar const */
if (sscreen->use_aco)
if (gsinfo->base.use_aco_amd)
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
@@ -2653,11 +2653,11 @@ si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
bool ok =
#if AMD_LLVM_AVAILABLE
!sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
!gs_selector->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader,
&args, debug, nir) :
#endif
si_aco_compile_shader(shader, &args, nir, debug);
if (ok) {
assert(!shader->config.scratch_bytes_per_wave);
ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
@@ -2857,7 +2857,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
struct si_shader_selector *sel = shader->selector;
/* ACO need spi_ps_input in advance to init args and used in compiler. */
if (sel->stage == MESA_SHADER_FRAGMENT && sscreen->use_aco)
if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd)
si_set_spi_ps_input_config(shader);
/* We need this info only when legacy GS. */
@@ -2923,7 +2923,8 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
ret =
#if AMD_LLVM_AVAILABLE
!sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
!sel->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args,
debug, nir) :
#endif
si_aco_compile_shader(shader, &args, nir, debug);
@@ -3015,7 +3016,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
if (sel->screen->info.gfx_level < GFX11 &&
(sel->screen->info.family < CHIP_GFX940 || sel->screen->info.has_graphics) &&
!si_is_merged_shader(shader)) {
if (sscreen->use_aco) {
if (sel->info.base.use_aco_amd) {
/* When aco scratch_offset arg is added explicitly at the beginning.
* After compile if no scratch used, reduce the input sgpr count.
*/
@@ -3087,9 +3088,14 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
result = CALLOC_STRUCT(si_shader_part);
result->key = *key;
ASSERTED bool use_aco = sscreen->use_aco ||
(stage == MESA_SHADER_FRAGMENT &&
((prolog && key->ps_prolog.use_aco) ||
(!prolog && key->ps_epilog.use_aco)));
bool ok =
#if AMD_LLVM_AVAILABLE
!sscreen->use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
!use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
#endif
si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
@@ -3144,6 +3150,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke
memset(key, 0, sizeof(*key));
key->ps_prolog.states = shader->key.ps.part.prolog;
key->ps_prolog.use_aco = info->base.use_aco_amd;
key->ps_prolog.wave32 = shader->wave_size == 32;
key->ps_prolog.colors_read = shader->info.ps_colors_read;
key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
@@ -3266,6 +3273,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke
{
struct si_shader_info *info = &shader->selector->info;
memset(key, 0, sizeof(*key));
key->ps_epilog.use_aco = info->base.use_aco_amd;
key->ps_epilog.wave32 = shader->wave_size == 32;
key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
key->ps_epilog.colors_written = info->colors_written;

View File

@@ -642,6 +642,7 @@ struct si_ps_epilog_bits {
union si_shader_part_key {
struct {
struct si_ps_prolog_bits states;
unsigned use_aco : 1;
unsigned wave32 : 1;
unsigned num_input_sgprs : 6;
/* Color interpolation and two-side color selection. */
@@ -654,6 +655,7 @@ union si_shader_part_key {
} ps_prolog;
struct {
struct si_ps_epilog_bits states;
unsigned use_aco : 1;
unsigned wave32 : 1;
unsigned uses_discard : 1;
unsigned colors_written : 8;

View File

@@ -9,6 +9,7 @@
#include "util/mesa-sha1.h"
#include "sid.h"
#include "nir.h"
#include "aco_interface.h"
struct si_shader_profile si_shader_profiles[] =
{
@@ -619,6 +620,8 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
{
memset(info, 0, sizeof(*info));
info->base = nir->info;
info->base.use_aco_amd = aco_is_gpu_supported(&sscreen->info) &&
(sscreen->use_aco || nir->info.use_aco_amd);
/* Get options from shader profiles. */
for (unsigned i = 0; i < ARRAY_SIZE(si_shader_profiles); i++) {

View File

@@ -71,6 +71,7 @@ static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data)
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
bool use_aco = sscreen->use_aco || nir->info.use_aco_amd;
bool progress;
do {
@@ -80,7 +81,7 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
nir->options->lower_to_scalar_filter, (void *)use_aco);
NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);
if (first) {
@@ -103,7 +104,7 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
if (lower_alu_to_scalar) {
NIR_PASS_V(nir, nir_lower_alu_to_scalar,
nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
nir->options->lower_to_scalar_filter, (void *)use_aco);
}
if (lower_phis_to_scalar)
NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
@@ -145,10 +146,8 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
if (nir->info.stage == MESA_SHADER_FRAGMENT)
NIR_PASS_V(nir, nir_opt_move_discards_to_top);
if (sscreen->info.has_packed_math_16bit) {
NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback,
(void *)sscreen->use_aco);
}
if (sscreen->info.has_packed_math_16bit)
NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco);
} while (progress);
NIR_PASS_V(nir, nir_lower_var_copies);

View File

@@ -301,7 +301,7 @@ static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder
* barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization
* barriers would unnecessarily increase VGPR usage for MSAA resources.
*/
if (!sctx->screen->use_aco && sctx->gfx_level >= GFX10) {
if (!b->shader->info.use_aco_amd && sctx->gfx_level >= GFX10) {
for (unsigned i = 0; i < num_elements; i++) {
unsigned prev_num = array[i]->num_components;
array[i] = nir_trim_vector(b, array[i], num_components);
@@ -360,6 +360,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
"blit_non_scaled_cs");
b.shader->info.use_aco_amd = sctx->screen->use_aco;
b.shader->info.num_images = options->is_clear ? 1 : 2;
unsigned image_dst_index = b.shader->info.num_images - 1;
if (!options->is_clear && options->src_is_msaa)
@@ -609,7 +610,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
* barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0).
*/
nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst));
if (lane_size > 1 && !sctx->screen->use_aco)
if (lane_size > 1 && !b.shader->info.use_aco_amd)
img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc);
/* Apply the blit output modifiers, once per sample. */

View File

@@ -2823,7 +2823,7 @@ static void si_build_shader_variant(struct si_shader *shader, int thread_index,
compiler = &shader->compiler_ctx_state.compiler;
}
if (!sscreen->use_aco && !*compiler)
if (!sel->info.base.use_aco_amd && !*compiler)
*compiler = si_create_llvm_compiler(sscreen);
if (unlikely(!si_create_shader_variant(sscreen, *compiler, shader, debug))) {
@@ -3039,7 +3039,7 @@ current_not_ready:
util_queue_fence_init(&shader->ready);
if (!sscreen->use_aco && !sctx->compiler)
if (!sel->info.base.use_aco_amd && !sctx->compiler)
sctx->compiler = si_create_llvm_compiler(sctx->screen);
shader->selector = sel;
@@ -3249,7 +3249,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind
assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler));
compiler = &sscreen->compiler[thread_index];
if (!sscreen->use_aco && !*compiler)
if (!sel->info.base.use_aco_amd && !*compiler)
*compiler = si_create_llvm_compiler(sscreen);
/* Serialize NIR to save memory. Monolithic shader variants