radv: New shader args for NGG culling settings and viewport.

Add new shader arguments in RADV for:
- NGG culling settings
- Viewport transform

These will be used by NGG culling shaders.

Additionally, some tweaks are made to some config registers in order to make culling shaders more efficient.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
2021-06-07 23:23:38 +02:00
parent ed163a44b6
commit 9a95f5487f
7 changed files with 315 additions and 5 deletions
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1327,6 +1327,19 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)

   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

+   if (pipeline->graphics.has_ngg_culling &&
+       pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
+       !cmd_buffer->state.last_nggc_settings) {
+      /* The already emitted RSRC2 contains the LDS required for NGG culling.
+       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
+       * API GS always needs LDS, so this isn't useful there.
+       */
+      struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
+      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
+                        S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
+   }
+
   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
       cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
@@ -3839,6 +3852,8 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi
   cmd_buffer->state.last_sx_ps_downconvert = -1;
   cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
   cmd_buffer->state.last_sx_blend_opt_control = -1;
+   cmd_buffer->state.last_nggc_settings = -1;
+   cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
@@ -4961,6 +4976,10 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
      if (secondary->state.last_index_type != -1) {
         primary->state.last_index_type = secondary->state.last_index_type;
      }
+
+      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
+      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
+      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
   }

   /* After executing commands from secondary buffers we have to dirty
@@ -5635,6 +5654,209 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
   return false;
 }

+/* Flag bits stored in the low bits of the NGG culling settings value. */
+enum {
+   ngg_cull_none = 0,
+   ngg_cull_front_face = 1 << 0,
+   ngg_cull_back_face = 1 << 1,
+   ngg_cull_face_is_ccw = 1 << 2,
+   ngg_cull_small_primitives = 1 << 3,
+};
+
+/* Decide whether NGG culling has to be turned off for the current draw.
+ *
+ * has_tess:      the last pre-rasterization stage is tessellation evaluation.
+ * vtx_cnt:       vertex count of the draw.
+ * indirect:      the draw is indirect.
+ * num_viewports: number of viewports currently in use.
+ *
+ * Returns true when culling must be skipped for this draw.
+ */
+ALWAYS_INLINE static bool
+radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
+                      bool indirect, unsigned num_viewports)
+{
+   /* Only the transform of viewport 0 is handed to the culling code, so
+    * primitives routed to any other viewport would be culled against the
+    * wrong transform. Never cull when more than one viewport is in use.
+    *
+    * TODO: Figure out how to do culling with multiple viewports efficiently.
+    */
+   if (num_viewports > 1)
+      return true;
+
+   /* If we have to draw only a few vertices, we get better latency if
+    * we disable NGG culling.
+    *
+    * When tessellation is used, what matters is the number of tessellated
+    * vertices, so let's always assume it's not a small draw.
+    */
+   return !has_tess && !indirect && vtx_cnt < 512;
+}
+
+/* Build the NGG culling settings value from the bound pipeline and the
+ * dynamic state of the command buffer.
+ *
+ * vp_y_inverted tells whether the viewport flips the Y axis; when it does,
+ * the winding order reported to the culling code is flipped as well.
+ *
+ * Returns a combination of the ngg_cull_* flags. When small primitive culling
+ * is enabled, the log2 of the small primitive precision is additionally
+ * stored starting at bit 24.
+ */
+ALWAYS_INLINE static uint32_t
+radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
+{
+   const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   const struct radv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Rasterizer discard: report both faces as culled so every triangle is dropped. */
+   if (dyn->rasterizer_discard_enable ||
+       G_028810_DX_RASTERIZATION_KILL(pipeline->graphics.pa_cl_clip_cntl))
+      return ngg_cull_front_face | ngg_cull_back_face;
+
+   const uint32_t su_sc_mode = pipeline->graphics.pa_su_sc_mode_cntl;
+   const bool dynamic_front_face =
+      pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE;
+   const bool dynamic_cull_mode =
+      pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE;
+
+   /* Winding order of front faces: taken from the dynamic state when the
+    * pipeline declares it dynamic, otherwise from the pipeline register value.
+    */
+   bool front_is_ccw;
+   if (dynamic_front_face)
+      front_is_ccw = dyn->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
+   else
+      front_is_ccw = G_028814_FACE(su_sc_mode) == 0;
+
+   /* An inverted viewport flips the winding order. */
+   if (vp_y_inverted)
+      front_is_ccw = !front_is_ccw;
+
+   /* Which faces are culled, again either dynamic or baked into the pipeline. */
+   bool cull_front, cull_back;
+   if (dynamic_cull_mode) {
+      cull_front = dyn->cull_mode & VK_CULL_MODE_FRONT_BIT;
+      cull_back = dyn->cull_mode & VK_CULL_MODE_BACK_BIT;
+   } else {
+      cull_front = G_028814_CULL_FRONT(su_sc_mode);
+      cull_back = G_028814_CULL_BACK(su_sc_mode);
+   }
+
+   uint32_t settings = ngg_cull_none;
+   if (front_is_ccw)
+      settings |= ngg_cull_face_is_ccw;
+   if (cull_front)
+      settings |= ngg_cull_front_face;
+   if (cull_back)
+      settings |= ngg_cull_back_face;
+
+   /* Small primitive culling is not valid with conservative overestimation. */
+   if (pipeline->graphics.uses_conservative_overestimate)
+      return settings;
+
+   /* The small primitive precision is the sample count divided by the
+    * subpixel scale. Both are powers of two, so only the (possibly negative)
+    * exponent is stored, in the bits starting at bit 24.
+    */
+   const unsigned subpixel_scale = 256;
+   const int32_t precision_log2 =
+      util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_scale);
+
+   settings |= ngg_cull_small_primitives;
+   settings |= ((uint32_t) precision_log2 << 24u);
+
+   return settings;
+}
+
+/* Emit the user SGPRs and registers that control NGG culling for the next draw.
+ *
+ * Depending on what changed since the previous call, this writes:
+ * - the viewport scale/translate user SGPRs (AC_UD_NGG_VIEWPORT),
+ * - the culling settings user SGPR (AC_UD_NGG_CULLING_SETTINGS),
+ * - SPI_SHADER_PGM_RSRC2_GS and GE_PC_ALLOC, when culling is switched on or off.
+ *
+ * The last emitted settings, their SGPR index and the last "skip" decision are
+ * cached in cmd_buffer->state.last_nggc_* so that redundant writes are avoided.
+ */
+static void
+radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
+{
+   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   const unsigned stage = pipeline->graphics.last_vgt_api_stage;
+   const bool nggc_supported = pipeline->graphics.has_ngg_culling;
+
+   if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
+      /* Current shader doesn't support culling and culling was already disabled:
+       * No further steps needed, just remember the SGPR's location is not set.
+       */
+      cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
+      return;
+   }
+
+   /* Check dirty flags:
+    * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
+    * - Dirty dynamic flags: culling settings may have changed.
+    */
+   const bool dirty =
+      cmd_buffer->state.dirty &
+      (RADV_CMD_DIRTY_PIPELINE |
+       RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
+       RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
+
+   /* Check small draw status:
+    * For small draw calls, we disable culling by setting the SGPR to 0.
+    */
+   const bool skip =
+      radv_skip_ngg_culling(
+         stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect,
+         cmd_buffer->state.dynamic.viewport.count);
+
+   /* See if anything changed. */
+   if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
+      return;
+
+   /* Remember small draw state. */
+   cmd_buffer->state.last_nggc_skip = skip;
+   const struct radv_shader_variant *v = pipeline->shaders[stage];
+   assert(v->info.has_ngg_culling == nggc_supported);
+
+   /* Find the user SGPR. */
+   const uint32_t base_reg = pipeline->user_data_0[stage];
+   const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
+   assert(!nggc_supported || nggc_sgpr_idx != -1);
+
+   /* Get viewport transform. */
+   float vp_scale[3], vp_translate[3];
+   radv_get_viewport_xform(&cmd_buffer->state.dynamic.viewport.viewports[0], vp_scale, vp_translate);
+   /* The comparison below holds when the Y scale is negative. */
+   bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
+
+   /* Get current culling settings. */
+   uint32_t nggc_settings = nggc_supported && !skip
+                            ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
+                            : ngg_cull_none;
+
+   /* The viewport SGPRs are only written while culling is on, so they must be
+    * (re-)emitted when the viewport changed, when the SGPR location changed,
+    * or when culling was off before this draw.
+    */
+   bool emit_viewport = nggc_settings &&
+                        (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
+                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
+                         !cmd_buffer->state.last_nggc_settings);
+
+   if (emit_viewport) {
+      /* Correction for inverted Y */
+      if (vp_y_inverted) {
+         vp_scale[1] = -vp_scale[1];
+         vp_translate[1] = -vp_translate[1];
+      }
+
+      /* Correction for number of samples per pixel. */
+      for (unsigned i = 0; i < 2; ++i) {
+         vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
+         vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
+      }
+
+      /* Only the X and Y components are passed on: scale first, then translate. */
+      uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
+      const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
+      assert(vp_sgpr_idx != -1);
+      radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
+      radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
+   }
+
+   bool emit_settings = nggc_supported &&
+                        (cmd_buffer->state.last_nggc_settings != nggc_settings ||
+                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
+
+   /* This needs to be emitted when culling is turned on
+    * and when it's already on but some settings change.
+    */
+   if (emit_settings) {
+      assert(nggc_sgpr_idx >= 0);
+      radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
+   }
+
+   /* These only need to be emitted when culling is turned on or off,
+    * but not when it stays on and just some settings change.
+    */
+   if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
+      const struct radv_physical_device *physical_device = cmd_buffer->device->physical_device;
+      uint32_t rsrc2 = v->config.rsrc2;
+      /* Value used while culling is off: a quarter of the parameter cache lines. */
+      uint32_t oversub_pc_lines = physical_device->rad_info.pc_lines / 4;
+
+      if (nggc_settings) {
+         /* Tweak the parameter cache oversubscription.
+          * This allows the HW to launch more NGG workgroups than the pre-allocated parameter
+          * cache would normally allow, yielding better perf when culling is on.
+          */
+         oversub_pc_lines = physical_device->rad_info.pc_lines * 3 / 4;
+      } else {
+         /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
+         if (stage != MESA_SHADER_GEOMETRY)
+            rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
+      }
+
+      /* When the pipeline is dirty, radv_emit_graphics_pipeline will write this register. */
+      if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)) {
+         radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+      }
+
+      /* Update parameter cache oversubscription setting. */
+      radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC,
+                                             S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) |
+                                             S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
+   }
+
+   /* Cache what was emitted so the next call can detect changes. */
+   cmd_buffer->state.last_nggc_settings = nggc_settings;
+   cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
+}
+
 static void
 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
 {
@@ -5644,6 +5866,10 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
       cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
      radv_emit_rbplus_state(cmd_buffer);

+   if ((cmd_buffer->device->instance->perftest_flags & RADV_PERFTEST_NGGC) &&
+       cmd_buffer->state.pipeline->graphics.is_ngg)
+      radv_emit_ngg_culling_state(cmd_buffer, info);
+
   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
      radv_emit_graphics_pipeline(cmd_buffer);

--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1790,6 +1790,10 @@ radv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
      S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
      S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 1 : 0) |
      S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+
+   pipeline->graphics.uses_conservative_overestimate =
+      radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) ==
+         VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
 }

 static void
@@ -5441,6 +5445,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
   pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);

   pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline);
+   pipeline->graphics.has_ngg_culling =
+      pipeline->graphics.is_ngg &&
+      pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling;

   radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);

--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1415,6 +1415,11 @@ struct radv_cmd_state {
   bool pending_sqtt_barrier_end;
   enum rgp_flush_bits sqtt_flush_bits;

+   /* NGG culling state. */
+   uint32_t last_nggc_settings;
+   int8_t last_nggc_settings_sgpr_idx;
+   bool last_nggc_skip;
+
   uint8_t cb_mip[MAX_RTS];

   /* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */
@@ -1762,6 +1767,7 @@ struct radv_pipeline {
         unsigned pa_cl_clip_cntl;
         unsigned cb_color_control;
         bool uses_dynamic_stride;
+         bool uses_conservative_overestimate;

         /* Used for rbplus */
         uint32_t col_format;
@@ -1769,6 +1775,7 @@ struct radv_pipeline {

         /* Whether the pipeline uses NGG (GFX10+). */
         bool is_ngg;
+         bool has_ngg_culling;

         /* Last pre-PS API stage */
         gl_shader_stage last_vgt_api_stage;
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -969,6 +969,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
            key->vs_common_out.export_prim_id,
            key->vs.provoking_vtx_last);

+      info->has_ngg_culling = out_conf.can_cull;
+      info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
      info->is_ngg_passthrough = out_conf.passthrough;
      key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -162,7 +162,9 @@ enum radv_ud_index {
   AC_UD_VIEW_INDEX = 4,
   AC_UD_STREAMOUT_BUFFERS = 5,
   AC_UD_NGG_GS_STATE = 6,
-   AC_UD_SHADER_START = 7,
+   AC_UD_NGG_CULLING_SETTINGS = 7,
+   AC_UD_NGG_VIEWPORT = 8,
+   AC_UD_SHADER_START = 9,
   AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
   AC_UD_VS_BASE_VERTEX_START_INSTANCE,
   AC_UD_VS_MAX_UD,
@@ -261,6 +263,8 @@ struct radv_shader_info {
   bool need_indirect_descriptor_sets;
   bool is_ngg;
   bool is_ngg_passthrough;
+   bool has_ngg_culling;
+   uint32_t num_lds_blocks_when_not_culling;
   uint32_t num_tess_patches;
   struct {
      uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX];
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -117,6 +117,19 @@ count_vs_user_sgprs(struct radv_shader_args *args)
   return count;
 }

+/* Number of user SGPRs taken by the NGG-specific arguments of the given stage. */
+static unsigned
+count_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage)
+{
+   /* ngg_gs_state */
+   const unsigned gs_state_sgprs = stage == MESA_SHADER_GEOMETRY ? 1 : 0;
+   /* ngg_culling_settings + 4x ngg_viewport_* */
+   const unsigned culling_sgprs = args->shader_info->has_ngg_culling ? 5 : 0;
+
+   return gs_state_sgprs + culling_sgprs;
+}
+
 static void
 allocate_inline_push_consts(struct radv_shader_args *args, struct user_sgpr_info *user_sgpr_info)
 {
@@ -184,6 +197,8 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
   case MESA_SHADER_VERTEX:
      if (!args->is_gs_copy_shader)
         user_sgpr_count += count_vs_user_sgprs(args);
+      if (args->options->key.vs_common_out.as_ngg)
+         user_sgpr_count += count_ngg_sgprs(args, stage);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (has_previous_stage) {
@@ -192,11 +207,13 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
      }
      break;
   case MESA_SHADER_TESS_EVAL:
+      if (args->options->key.vs_common_out.as_ngg)
+         user_sgpr_count += count_ngg_sgprs(args, stage);
      break;
   case MESA_SHADER_GEOMETRY:
      if (has_previous_stage) {
         if (args->options->key.vs_common_out.as_ngg)
-            user_sgpr_count++; /* NGG GS state */
+            user_sgpr_count += count_ngg_sgprs(args, stage);

         if (previous_stage == MESA_SHADER_VERTEX) {
            user_sgpr_count += count_vs_user_sgprs(args);
@@ -356,6 +373,22 @@ declare_tes_input_vgprs(struct radv_shader_args *args)
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
 }

+/* Declare the NGG-specific SGPR arguments: the GS state for geometry shaders,
+ * then the culling settings and the viewport scale/translate when the shader
+ * has NGG culling.
+ */
+static void
+declare_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage)
+{
+   if (stage == MESA_SHADER_GEOMETRY)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state);
+
+   if (!args->shader_info->has_ngg_culling)
+      return;
+
+   /* Declared in this exact order: settings, scale x/y, translate x/y. */
+   struct ac_arg *const culling_args[] = {
+      &args->ngg_culling_settings,
+      &args->ngg_viewport_scale[0],
+      &args->ngg_viewport_scale[1],
+      &args->ngg_viewport_translate[0],
+      &args->ngg_viewport_translate[1],
+   };
+   const unsigned num_culling_args = sizeof(culling_args) / sizeof(culling_args[0]);
+
+   for (unsigned i = 0; i < num_culling_args; ++i)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, culling_args[i]);
+}
+
 static void
 set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info,
                      uint8_t *user_sgpr_idx)
@@ -405,6 +438,24 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
   }
 }

+/* Record the user SGPR locations of the NGG-specific arguments.
+ *
+ * user_sgpr_idx is handed to set_loc_shader by pointer for every argument
+ * (presumably so that it advances past the SGPRs just assigned).
+ */
+static void
+set_ngg_sgprs_locs(struct radv_shader_args *args, gl_shader_stage stage, uint8_t *user_sgpr_idx)
+{
+   const bool is_gs = stage == MESA_SHADER_GEOMETRY;
+
+   if (is_gs) {
+      assert(args->ngg_gs_state.used);
+      set_loc_shader(args, AC_UD_NGG_GS_STATE, user_sgpr_idx, 1);
+   }
+
+   if (!args->shader_info->has_ngg_culling)
+      return;
+
+   assert(args->ngg_culling_settings.used &&
+          args->ngg_viewport_scale[0].used && args->ngg_viewport_scale[1].used &&
+          args->ngg_viewport_translate[0].used && args->ngg_viewport_translate[1].used);
+
+   /* One SGPR for the settings, followed by four for the viewport transform. */
+   set_loc_shader(args, AC_UD_NGG_CULLING_SETTINGS, user_sgpr_idx, 1);
+   set_loc_shader(args, AC_UD_NGG_VIEWPORT, user_sgpr_idx, 4);
+}
+
 /* Returns whether the stage is a stage that can be directly before the GS */
 static bool
 is_pre_gs_stage(gl_shader_stage stage)
@@ -488,6 +539,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
      if (args->options->explicit_scratch_args) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }
+      if (args->options->key.vs_common_out.as_ngg) {
+         declare_ngg_sgprs(args, stage);
+      }

      declare_vs_input_vgprs(args);
      break;
@@ -547,6 +601,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
      if (args->options->explicit_scratch_args) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }
+      if (args->options->key.vs_common_out.as_ngg) {
+         declare_ngg_sgprs(args, stage);
+      }
      declare_tes_input_vgprs(args);
      break;
   case MESA_SHADER_GEOMETRY:
@@ -576,7 +633,7 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
         }

         if (args->options->key.vs_common_out.as_ngg) {
-            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state);
+            declare_ngg_sgprs(args, stage);
         }

         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
@@ -669,6 +726,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
      set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+      if (args->options->key.vs_common_out.as_ngg)
+         set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
      break;
   case MESA_SHADER_TESS_CTRL:
      set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
@@ -678,6 +737,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
   case MESA_SHADER_TESS_EVAL:
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+      if (args->options->key.vs_common_out.as_ngg)
+         set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
      break;
   case MESA_SHADER_GEOMETRY:
      if (has_previous_stage) {
@@ -688,8 +749,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);

-      if (args->ngg_gs_state.used)
-         set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
+      if (args->options->key.vs_common_out.as_ngg)
+         set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
      break;
   case MESA_SHADER_FRAGMENT:
      break;
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -41,6 +41,9 @@ struct radv_shader_args {

   /* NGG GS */
   struct ac_arg ngg_gs_state;
+   struct ac_arg ngg_culling_settings;
+   struct ac_arg ngg_viewport_scale[2];
+   struct ac_arg ngg_viewport_translate[2];

   bool is_gs_copy_shader;
   bool is_trap_handler_shader;