From 88c951bd463af52abd0257419f1601d7767a963d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Thu, 9 Jan 2025 18:27:33 -0600
Subject: [PATCH] ac/nir: Move ac_nir_lower_legacy_gs to separate file.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32966>
---
 src/amd/common/ac_nir.c                 | 275 -----------------------
 src/amd/common/ac_nir_lower_legacy_gs.c | 285 ++++++++++++++++++++++++
 src/amd/common/meson.build              |   1 +
 3 files changed, 286 insertions(+), 275 deletions(-)
 create mode 100644 src/amd/common/ac_nir_lower_legacy_gs.c

diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index 334ec372a10..5da74995e14 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -1027,281 +1027,6 @@ ac_nir_gs_shader_query(nir_builder *b,
    return true;
 }
 
-typedef struct {
-   nir_def *outputs[64][4];
-   nir_def *outputs_16bit_lo[16][4];
-   nir_def *outputs_16bit_hi[16][4];
-
-   ac_nir_gs_output_info *info;
-
-   nir_def *vertex_count[4];
-   nir_def *primitive_count[4];
-} lower_legacy_gs_state;
-
-static bool
-lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
-                             lower_legacy_gs_state *s)
-{
-   /* Assume:
-    * - the shader used nir_lower_io_to_temporaries
-    * - 64-bit outputs are lowered
-    * - no indirect indexing is present
-    */
-   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
-
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   unsigned component = nir_intrinsic_component(intrin);
-   unsigned write_mask = nir_intrinsic_write_mask(intrin);
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
-
-   nir_def **outputs;
-   if (sem.location < VARYING_SLOT_VAR0_16BIT) {
-      outputs = s->outputs[sem.location];
-   } else {
-      unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
-      if (sem.high_16bits)
-         outputs = s->outputs_16bit_hi[index];
-      else
-         outputs = s->outputs_16bit_lo[index];
-   }
-
-   nir_def *store_val = intrin->src[0].ssa;
-   /* 64bit output has been lowered to 32bit */
-   assert(store_val->bit_size <= 32);
-
-   /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
-   const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;
-
-   u_foreach_bit (i, write_mask) {
-      unsigned comp = component + i;
-      nir_def *store_component = nir_channel(b, store_val, i);
-
-      if (non_dedicated_16bit) {
-         if (sem.high_16bits) {
-            nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
-            outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component);
-         } else {
-            nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
-            outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi);
-         }
-      } else {
-         outputs[comp] = store_component;
-      }
-   }
-
-   nir_instr_remove(&intrin->instr);
-   return true;
-}
-
-static bool
-lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
-                                         lower_legacy_gs_state *s)
-{
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   unsigned stream = nir_intrinsic_stream_id(intrin);
-   nir_def *vtxidx = intrin->src[0].ssa;
-
-   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
-   nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b);
-
-   unsigned offset = 0;
-   u_foreach_bit64 (i, b->shader->info.outputs_written) {
-      for (unsigned j = 0; j < 4; j++) {
-         nir_def *output = s->outputs[i][j];
-         /* Next vertex emit need a new value, reset all outputs. */
-         s->outputs[i][j] = NULL;
-
-         const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i];
-
-         if (!(usage_mask & (1 << j)) ||
-             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
-            continue;
-
-         unsigned base = offset * b->shader->info.gs.vertices_out * 4;
-         offset++;
-
-         /* no one set this output, skip the buffer store */
-         if (!output)
-            continue;
-
-         nir_def *voffset = nir_ishl_imm(b, vtxidx, 2);
-
-         /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
-         nir_def *data = nir_u2uN(b, output, 32);
-
-         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
-                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
-                                        ACCESS_IS_SWIZZLED_AMD,
-                              .base = base,
-                              /* For ACO to not reorder this store around EmitVertex/EndPrimitve */
-                              .memory_modes = nir_var_shader_out);
-      }
-   }
-
-   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
-      for (unsigned j = 0; j < 4; j++) {
-         nir_def *output_lo = s->outputs_16bit_lo[i][j];
-         nir_def *output_hi = s->outputs_16bit_hi[i][j];
-         /* Next vertex emit need a new value, reset all outputs. */
-         s->outputs_16bit_lo[i][j] = NULL;
-         s->outputs_16bit_hi[i][j] = NULL;
-
-         bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) &&
-            ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
-         bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) &&
-            ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
-         if (!has_lo_16bit && !has_hi_16bit)
-            continue;
-
-         unsigned base = offset * b->shader->info.gs.vertices_out;
-         offset++;
-
-         bool has_lo_16bit_out = has_lo_16bit && output_lo;
-         bool has_hi_16bit_out = has_hi_16bit && output_hi;
-
-         /* no one set needed output, skip the buffer store */
-         if (!has_lo_16bit_out && !has_hi_16bit_out)
-            continue;
-
-         if (!has_lo_16bit_out)
-            output_lo = nir_undef(b, 1, 16);
-
-         if (!has_hi_16bit_out)
-            output_hi = nir_undef(b, 1, 16);
-
-         nir_def *voffset = nir_iadd_imm(b, vtxidx, base);
-         voffset = nir_ishl_imm(b, voffset, 2);
-
-         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
-                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
-                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
-                                        ACCESS_IS_SWIZZLED_AMD,
-                              /* For ACO to not reorder this store around EmitVertex/EndPrimitve */
-                              .memory_modes = nir_var_shader_out);
-      }
-   }
-
-   /* Signal vertex emission. */
-   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
-                   .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));
-
-   nir_instr_remove(&intrin->instr);
-   return true;
-}
-
-static bool
-lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
-                                               lower_legacy_gs_state *s)
-{
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   unsigned stream = nir_intrinsic_stream_id(intrin);
-
-   s->vertex_count[stream] = intrin->src[0].ssa;
-   s->primitive_count[stream] = intrin->src[1].ssa;
-
-   nir_instr_remove(&intrin->instr);
-   return true;
-}
-
-static bool
-lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
-                                               lower_legacy_gs_state *s)
-{
-   b->cursor = nir_before_instr(&intrin->instr);
-   const unsigned stream = nir_intrinsic_stream_id(intrin);
-
-   /* Signal primitive emission. */
-   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
-                   .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8));
-
-   nir_instr_remove(&intrin->instr);
-   return true;
-}
-
-static bool
-lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state)
-{
-   lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;
-
-   if (instr->type != nir_instr_type_intrinsic)
-      return false;
-
-   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-   if (intrin->intrinsic == nir_intrinsic_store_output)
-      return lower_legacy_gs_store_output(b, intrin, s);
-   else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
-      return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
-   else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter)
-      return lower_legacy_gs_end_primitive_with_counter(b, intrin, s);
-   else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
-      return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);
-
-   return false;
-}
-
-void
-ac_nir_lower_legacy_gs(nir_shader *nir,
-                       bool has_gen_prim_query,
-                       bool has_pipeline_stats_query,
-                       ac_nir_gs_output_info *output_info)
-{
-   lower_legacy_gs_state s = {
-      .info = output_info,
-   };
-
-   unsigned num_vertices_per_primitive = 0;
-   switch (nir->info.gs.output_primitive) {
-   case MESA_PRIM_POINTS:
-      num_vertices_per_primitive = 1;
-      break;
-   case MESA_PRIM_LINE_STRIP:
-      num_vertices_per_primitive = 2;
-      break;
-   case MESA_PRIM_TRIANGLE_STRIP:
-      num_vertices_per_primitive = 3;
-      break;
-   default:
-      unreachable("Invalid GS output primitive.");
-      break;
-   }
-
-   nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic,
-                                nir_metadata_control_flow, &s);
-
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-
-   nir_builder builder = nir_builder_at(nir_after_impl(impl));
-   nir_builder *b = &builder;
-
-   /* Emit shader query for mix use legacy/NGG GS */
-   bool progress = ac_nir_gs_shader_query(b,
-                                          has_gen_prim_query,
-                                          has_pipeline_stats_query,
-                                          has_pipeline_stats_query,
-                                          num_vertices_per_primitive,
-                                          64,
-                                          s.vertex_count,
-                                          s.primitive_count);
-
-   /* Wait for all stores to finish. */
-   nir_barrier(b, .execution_scope = SCOPE_INVOCATION,
-                      .memory_scope = SCOPE_DEVICE,
-                      .memory_semantics = NIR_MEMORY_RELEASE,
-                      .memory_modes = nir_var_shader_out | nir_var_mem_ssbo |
-                                      nir_var_mem_global | nir_var_image);
-
-   /* Signal that the GS is done. */
-   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
-                   .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE);
-
-   if (progress)
-      nir_metadata_preserve(impl, nir_metadata_none);
-}
-
 /* Shader logging function for printing nir_def values. The driver prints this after
  * command submission.
  *
diff --git a/src/amd/common/ac_nir_lower_legacy_gs.c b/src/amd/common/ac_nir_lower_legacy_gs.c
new file mode 100644
index 00000000000..1a1b06c46ab
--- /dev/null
+++ b/src/amd/common/ac_nir_lower_legacy_gs.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright © 2021 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ac_nir.h"
+#include "ac_nir_helpers.h"
+
+#include "nir_builder.h"
+
+typedef struct {
+   nir_def *outputs[64][4];
+   nir_def *outputs_16bit_lo[16][4];
+   nir_def *outputs_16bit_hi[16][4];
+
+   ac_nir_gs_output_info *info;
+
+   nir_def *vertex_count[4];
+   nir_def *primitive_count[4];
+} lower_legacy_gs_state;
+
+static bool
+lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
+                             lower_legacy_gs_state *s)
+{
+   /* Assume:
+    * - the shader used nir_lower_io_to_temporaries
+    * - 64-bit outputs are lowered
+    * - no indirect indexing is present
+    */
+   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned component = nir_intrinsic_component(intrin);
+   unsigned write_mask = nir_intrinsic_write_mask(intrin);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+
+   nir_def **outputs;
+   if (sem.location < VARYING_SLOT_VAR0_16BIT) {
+      outputs = s->outputs[sem.location];
+   } else {
+      unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
+      if (sem.high_16bits)
+         outputs = s->outputs_16bit_hi[index];
+      else
+         outputs = s->outputs_16bit_lo[index];
+   }
+
+   nir_def *store_val = intrin->src[0].ssa;
+   /* 64bit output has been lowered to 32bit */
+   assert(store_val->bit_size <= 32);
+
+   /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
+   const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;
+
+   u_foreach_bit (i, write_mask) {
+      unsigned comp = component + i;
+      nir_def *store_component = nir_channel(b, store_val, i);
+
+      if (non_dedicated_16bit) {
+         if (sem.high_16bits) {
+            nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
+            outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component);
+         } else {
+            nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
+            outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi);
+         }
+      } else {
+         outputs[comp] = store_component;
+      }
+   }
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
+                                         lower_legacy_gs_state *s)
+{
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned stream = nir_intrinsic_stream_id(intrin);
+   nir_def *vtxidx = intrin->src[0].ssa;
+
+   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
+   nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b);
+
+   unsigned offset = 0;
+   u_foreach_bit64 (i, b->shader->info.outputs_written) {
+      for (unsigned j = 0; j < 4; j++) {
+         nir_def *output = s->outputs[i][j];
+         /* Next vertex emit need a new value, reset all outputs. */
+         s->outputs[i][j] = NULL;
+
+         const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i];
+
+         if (!(usage_mask & (1 << j)) ||
+             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
+            continue;
+
+         unsigned base = offset * b->shader->info.gs.vertices_out * 4;
+         offset++;
+
+         /* no one set this output, skip the buffer store */
+         if (!output)
+            continue;
+
+         nir_def *voffset = nir_ishl_imm(b, vtxidx, 2);
+
+         /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
+         nir_def *data = nir_u2uN(b, output, 32);
+
+         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
+                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
+                                        ACCESS_IS_SWIZZLED_AMD,
+                              .base = base,
+                              /* For ACO to not reorder this store around EmitVertex/EndPrimitve */
+                              .memory_modes = nir_var_shader_out);
+      }
+   }
+
+   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
+      for (unsigned j = 0; j < 4; j++) {
+         nir_def *output_lo = s->outputs_16bit_lo[i][j];
+         nir_def *output_hi = s->outputs_16bit_hi[i][j];
+         /* Next vertex emit need a new value, reset all outputs. */
+         s->outputs_16bit_lo[i][j] = NULL;
+         s->outputs_16bit_hi[i][j] = NULL;
+
+         bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) &&
+            ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
+         bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) &&
+            ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
+         if (!has_lo_16bit && !has_hi_16bit)
+            continue;
+
+         unsigned base = offset * b->shader->info.gs.vertices_out;
+         offset++;
+
+         bool has_lo_16bit_out = has_lo_16bit && output_lo;
+         bool has_hi_16bit_out = has_hi_16bit && output_hi;
+
+         /* no one set needed output, skip the buffer store */
+         if (!has_lo_16bit_out && !has_hi_16bit_out)
+            continue;
+
+         if (!has_lo_16bit_out)
+            output_lo = nir_undef(b, 1, 16);
+
+         if (!has_hi_16bit_out)
+            output_hi = nir_undef(b, 1, 16);
+
+         nir_def *voffset = nir_iadd_imm(b, vtxidx, base);
+         voffset = nir_ishl_imm(b, voffset, 2);
+
+         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
+                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
+                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
+                                        ACCESS_IS_SWIZZLED_AMD,
+                              /* For ACO to not reorder this store around EmitVertex/EndPrimitve */
+                              .memory_modes = nir_var_shader_out);
+      }
+   }
+
+   /* Signal vertex emission. */
+   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
+                   .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
+                                               lower_legacy_gs_state *s)
+{
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned stream = nir_intrinsic_stream_id(intrin);
+
+   s->vertex_count[stream] = intrin->src[0].ssa;
+   s->primitive_count[stream] = intrin->src[1].ssa;
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
+                                               lower_legacy_gs_state *s)
+{
+   b->cursor = nir_before_instr(&intrin->instr);
+   const unsigned stream = nir_intrinsic_stream_id(intrin);
+
+   /* Signal primitive emission. */
+   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
+                   .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8));
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state)
+{
+   lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   if (intrin->intrinsic == nir_intrinsic_store_output)
+      return lower_legacy_gs_store_output(b, intrin, s);
+   else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
+      return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
+   else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter)
+      return lower_legacy_gs_end_primitive_with_counter(b, intrin, s);
+   else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
+      return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);
+
+   return false;
+}
+
+void
+ac_nir_lower_legacy_gs(nir_shader *nir,
+                       bool has_gen_prim_query,
+                       bool has_pipeline_stats_query,
+                       ac_nir_gs_output_info *output_info)
+{
+   lower_legacy_gs_state s = {
+      .info = output_info,
+   };
+
+   unsigned num_vertices_per_primitive = 0;
+   switch (nir->info.gs.output_primitive) {
+   case MESA_PRIM_POINTS:
+      num_vertices_per_primitive = 1;
+      break;
+   case MESA_PRIM_LINE_STRIP:
+      num_vertices_per_primitive = 2;
+      break;
+   case MESA_PRIM_TRIANGLE_STRIP:
+      num_vertices_per_primitive = 3;
+      break;
+   default:
+      unreachable("Invalid GS output primitive.");
+      break;
+   }
+
+   nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic,
+                                nir_metadata_control_flow, &s);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_builder builder = nir_builder_at(nir_after_impl(impl));
+   nir_builder *b = &builder;
+
+   /* Emit shader query for mix use legacy/NGG GS */
+   bool progress = ac_nir_gs_shader_query(b,
+                                          has_gen_prim_query,
+                                          has_pipeline_stats_query,
+                                          has_pipeline_stats_query,
+                                          num_vertices_per_primitive,
+                                          64,
+                                          s.vertex_count,
+                                          s.primitive_count);
+
+   /* Wait for all stores to finish. */
+   nir_barrier(b, .execution_scope = SCOPE_INVOCATION,
+                      .memory_scope = SCOPE_DEVICE,
+                      .memory_semantics = NIR_MEMORY_RELEASE,
+                      .memory_modes = nir_var_shader_out | nir_var_mem_ssbo |
+                                      nir_var_mem_global | nir_var_image);
+
+   /* Signal that the GS is done. */
+   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
+                   .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE);
+
+   if (progress)
+      nir_metadata_preserve(impl, nir_metadata_none);
+}
diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build
index 53c020ec883..0a9a57468ce 100644
--- a/src/amd/common/meson.build
+++ b/src/amd/common/meson.build
@@ -92,6 +92,7 @@ amd_common_files = files(
   'ac_nir_lower_global_access.c',
   'ac_nir_lower_image_opcodes_cdna.c',
   'ac_nir_lower_intrinsics_to_args.c',
+  'ac_nir_lower_legacy_gs.c',
   'ac_nir_lower_legacy_vs.c',
   'ac_nir_lower_resinfo.c',
   'ac_nir_lower_taskmesh_io_to_mem.c',