radeonsi: merge all preamble states into one

Tess registers are appended. GS registers are appended or overwritten
if they are already set. There are separate TMZ and non-TMZ preambles.

The preamble will be passed to the kernel as an IB that is executed only on a
context switch.

Reviewed-by: Mihai Preda <mhpreda@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16509>

Author:       Marek Olšák
Date:         2022-05-14 10:23:05 -04:00
Committed by: Marge Bot
Parent:       f46cd73e29
Commit:       32c7805ccc
6 changed files with 80 additions and 82 deletions


@@ -416,10 +416,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
       if (ctx->cs_preamble_state)
          ac_parse_ib(f, ctx->cs_preamble_state->pm4, ctx->cs_preamble_state->ndw, NULL, 0,
                      "IB2: Init config", ctx->gfx_level, NULL, NULL);
-
-      if (ctx->cs_preamble_gs_rings)
-         ac_parse_ib(f, ctx->cs_preamble_gs_rings->pm4, ctx->cs_preamble_gs_rings->ndw, NULL, 0,
-                     "IB2: Init GS rings", ctx->gfx_level, NULL, NULL);
    }
 
    if (scs->flushed) {


@@ -437,12 +437,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    /* The CS initialization should be emitted before everything else. */
    if (ctx->cs_preamble_state)
-      si_pm4_emit(ctx, ctx->cs_preamble_state);
-
-   if (ctx->cs_preamble_tess_rings)
-      si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_tess_rings_tmz :
-                                             ctx->cs_preamble_tess_rings);
-
-   if (ctx->cs_preamble_gs_rings)
-      si_pm4_emit(ctx, ctx->cs_preamble_gs_rings);
+      si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_state_tmz : ctx->cs_preamble_state);
 
    if (ctx->queued.named.ls)
       ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
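
With the ring registers folded into the main preamble, beginning a gfx IB only has to choose between the two prebuilt preambles. The selection amounts to the following, sketched with illustrative stand-in types rather than the real radeonsi ones (pick_preamble and gfx_context are hypothetical names):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins for si_pm4_state / si_context. */
struct pm4_state {
   uint16_t ndw;       /* number of command dwords in use */
   uint32_t pm4[512];  /* the command dwords themselves */
};

struct gfx_context {
   struct pm4_state *cs_preamble_state;     /* non-TMZ preamble */
   struct pm4_state *cs_preamble_state_tmz; /* TMZ copy with TMZ ring addresses */
};

/* Exactly one preamble is emitted at the start of every gfx IB; which one
 * depends on whether the IB is submitted as TMZ (secure). */
static struct pm4_state *pick_preamble(struct gfx_context *ctx, bool is_secure)
{
   return is_secure ? ctx->cs_preamble_state_tmz : ctx->cs_preamble_state;
}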


@@ -222,12 +222,9 @@ static void si_destroy_context(struct pipe_context *context)
    if (sctx->cs_preamble_state)
       si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);
-   if (sctx->cs_preamble_tess_rings)
-      si_pm4_free_state(sctx, sctx->cs_preamble_tess_rings, ~0);
-   if (sctx->cs_preamble_tess_rings_tmz)
-      si_pm4_free_state(sctx, sctx->cs_preamble_tess_rings_tmz, ~0);
-   if (sctx->cs_preamble_gs_rings)
-      si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0);
+   if (sctx->cs_preamble_state_tmz)
+      si_pm4_free_state(sctx, sctx->cs_preamble_state_tmz, ~0);
 
    for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
       si_pm4_free_state(sctx, sctx->vgt_shader_config[i], SI_STATE_IDX(vgt_shader_config));


@@ -1051,10 +1051,12 @@ struct si_context {
    /* Precomputed states. */
    struct si_pm4_state *cs_preamble_state;
-   struct si_pm4_state *cs_preamble_tess_rings;
-   struct si_pm4_state *cs_preamble_tess_rings_tmz;
-   struct si_pm4_state *cs_preamble_gs_rings;
+   struct si_pm4_state *cs_preamble_state_tmz;
+   uint16_t gs_ring_state_dw_offset;
+   uint16_t gs_ring_state_dw_offset_tmz;
    bool cs_preamble_has_vgt_flush;
+   bool cs_preamble_has_vgt_flush_tmz;
    struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES];
 
    /* shaders */


@@ -5905,4 +5905,8 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
    }
 
    sctx->cs_preamble_state = pm4;
+
+   /* Make a copy of the preamble for TMZ. */
+   sctx->cs_preamble_state_tmz = (struct si_pm4_state *)CALLOC_STRUCT(si_cs_preamble);
+   memcpy(sctx->cs_preamble_state_tmz, sctx->cs_preamble_state, sizeof(struct si_cs_preamble));
 }
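
The memcpy works because the preamble is not allocated as a bare si_pm4_state: it is the larger si_cs_preamble wrapper (defined outside this hunk) that reserves extra room for the command dwords, so copying sizeof(struct si_cs_preamble) duplicates the packets together with the header. A rough, self-contained sketch of that pattern; the struct layout, field names and sizes here are illustrative, not the actual Mesa definitions:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for si_pm4_state: a header plus inline dword storage. */
struct pm4_state {
   uint16_t ndw;      /* dwords used so far */
   uint32_t pm4[64];  /* base command storage */
};

/* Illustrative stand-in for si_cs_preamble: the same header with extra room
 * appended, because the merged preamble is larger than a normal state. */
struct cs_preamble {
   struct pm4_state base;
   uint32_t more_pm4[192];
};

/* Duplicating the preamble is one allocation plus one flat memcpy, which is
 * all the TMZ copy above needs. 'src' must itself have been allocated as a
 * cs_preamble, not as a plain pm4_state. */
static struct pm4_state *dup_preamble(const struct pm4_state *src)
{
   struct cs_preamble *copy = calloc(1, sizeof(*copy));

   if (!copy)
      return NULL;
   memcpy(copy, src, sizeof(*copy));
   return &copy->base;
}

The TMZ copy then only needs its ring addresses and sizes patched, which the hunks below do whenever the rings are (re)allocated.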


@@ -3666,22 +3666,27 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 /**
  * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
  */
-static void si_cs_preamble_add_vgt_flush(struct si_context *sctx)
+static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz)
 {
+   struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
+   bool *has_vgt_flush = tmz ? &sctx->cs_preamble_has_vgt_flush_tmz :
+                               &sctx->cs_preamble_has_vgt_flush;
+
    /* We shouldn't get here if registers are shadowed. */
    assert(!sctx->shadowed_regs);
 
-   if (sctx->cs_preamble_has_vgt_flush)
+   if (*has_vgt_flush)
       return;
 
    /* Done by Vulkan before VGT_FLUSH. */
-   si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0));
-   si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+   si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 
    /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
-   si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0));
-   si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+   si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 
-   sctx->cs_preamble_has_vgt_flush = true;
+   *has_vgt_flush = true;
 }
 
 /**
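
For context, si_pm4_cmd_add appends raw dwords to the state's command buffer, so each EVENT_WRITE above is a two-dword PM4 packet: a type-3 header followed by the event payload. A simplified model of that append path, with stand-in types; the real helper also tracks the last opcode (see last_opcode in the GS ring hunk below), presumably so consecutive register writes can be merged, which this sketch leaves out:

#include <assert.h>
#include <stdint.h>

/* PM4 type-3 packet header: bits 31:30 = 3, bits 29:16 = count field
 * (packet size minus two dwords), bits 15:8 = opcode. EVENT_WRITE is 0x46. */
#define PKT3_HDR(op, count) ((3u << 30) | ((uint32_t)(count) << 16) | ((uint32_t)(op) << 8))
#define OP_EVENT_WRITE 0x46

struct pm4_state {
   uint16_t ndw;          /* dwords used so far */
   uint32_t pm4[512];     /* command dword storage */
};

/* Append one raw dword, as si_pm4_cmd_add does. */
static void pm4_cmd_add(struct pm4_state *state, uint32_t dw)
{
   assert(state->ndw < 512);
   state->pm4[state->ndw++] = dw;
}

/* One EVENT_WRITE is two such dwords: the header and the event type/index. */
static void pm4_add_event_write(struct pm4_state *state, uint32_t event_dw)
{
   pm4_cmd_add(state, PKT3_HDR(OP_EVENT_WRITE, 0));
   pm4_cmd_add(state, event_dw);
}
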
@@ -3709,7 +3714,6 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
    struct si_shader_selector *es =
       sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso;
    struct si_shader_selector *gs = sctx->shader.gs.cso;
-   struct si_pm4_state *pm4;
 
    /* Chip constants. */
    unsigned num_se = sctx->screen->info.max_se;
@@ -3811,32 +3815,43 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
    }
 
    /* The codepath without register shadowing. */
-   /* Create the "cs_preamble_gs_rings" state. */
-   pm4 = CALLOC_STRUCT(si_pm4_state);
-   if (!pm4)
-      return false;
-
-   if (sctx->gfx_level >= GFX7) {
-      if (sctx->esgs_ring) {
-         assert(sctx->gfx_level <= GFX8);
-         si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
-      }
-      if (sctx->gsvs_ring)
-         si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
-   } else {
-      if (sctx->esgs_ring)
-         si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
-      if (sctx->gsvs_ring)
-         si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
-   }
-
-   /* Set the state. */
-   if (sctx->cs_preamble_gs_rings)
-      si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0);
-   sctx->cs_preamble_gs_rings = pm4;
-
-   si_cs_preamble_add_vgt_flush(sctx);
+   for (unsigned tmz = 0; tmz <= 1; tmz++) {
+      struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
+      uint16_t *gs_ring_state_dw_offset = tmz ? &sctx->gs_ring_state_dw_offset_tmz :
+                                                &sctx->gs_ring_state_dw_offset;
+      unsigned old_ndw = 0;
+
+      si_cs_preamble_add_vgt_flush(sctx, tmz);
+
+      if (!*gs_ring_state_dw_offset) {
+         /* We are here for the first time. The packets will be added. */
+         *gs_ring_state_dw_offset = pm4->ndw;
+      } else {
+         /* We have been here before. Overwrite the previous packets. */
+         old_ndw = pm4->ndw;
+         pm4->ndw = *gs_ring_state_dw_offset;
+      }
+
+      if (sctx->gfx_level >= GFX7) {
+         if (sctx->esgs_ring) {
+            assert(sctx->gfx_level <= GFX8);
+            si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+         }
+         if (sctx->gsvs_ring)
+            si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+      } else {
+         if (sctx->esgs_ring)
+            si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+         if (sctx->gsvs_ring)
+            si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+      }
+
+      if (old_ndw) {
+         pm4->ndw = old_ndw;
+         pm4->last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
+      }
+   }
 
    /* Flush the context to re-emit both cs_preamble states. */
    sctx->initial_gfx_cs_size = 0; /* force flush */
    si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
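
This loop is what the commit message means by "appended or overwritten": the first time the GS ring registers are written, their dword offset inside the preamble is recorded; on later ring reallocations the same slot is rewritten in place by temporarily rewinding ndw, then restoring it and invalidating last_opcode so the next register write cannot merge into the rewritten packets. A small self-contained model of that trick, with stand-in types and a deliberately simplified set_reg (the real packet encoding is omitted):

#include <stdint.h>

struct pm4_state {
   uint16_t ndw;          /* dwords used so far */
   uint8_t  last_opcode;  /* last PKT3 opcode, used for packet merging */
   uint32_t pm4[512];
};

/* Simplified register write: one three-dword packet per register. The real
 * si_pm4_set_reg emits a proper PKT3 SET_*_REG packet and can merge
 * consecutive registers; those details are left out here. */
static void pm4_set_reg(struct pm4_state *s, uint32_t reg, uint32_t value)
{
   s->pm4[s->ndw++] = 0xC0000000u; /* placeholder packet header */
   s->pm4[s->ndw++] = reg;         /* placeholder register encoding */
   s->pm4[s->ndw++] = value;
}

/* Write, or rewrite, the GS ring sizes at a fixed slot inside the preamble.
 * '*dw_offset' starts at 0 and persists across calls, playing the role of
 * sctx->gs_ring_state_dw_offset in the hunk above. */
static void update_gs_ring_regs(struct pm4_state *preamble, uint16_t *dw_offset,
                                uint32_t esgs_size, uint32_t gsvs_size)
{
   uint16_t old_ndw = 0;

   if (!*dw_offset) {
      /* First call: the packets are appended at the current end. */
      *dw_offset = preamble->ndw;
   } else {
      /* Later calls: rewind to the recorded slot and overwrite it. */
      old_ndw = preamble->ndw;
      preamble->ndw = *dw_offset;
   }

   pm4_set_reg(preamble, 0x0300, esgs_size); /* illustrative register offsets */
   pm4_set_reg(preamble, 0x0304, gsvs_size);

   if (old_ndw) {
      /* Restore the real size and forget the last opcode so that nothing
       * merges into the packets that were just rewritten. */
      preamble->ndw = old_ndw;
      preamble->last_opcode = 255; /* invalid opcode */
   }
}

Restoring ndw afterwards keeps whatever was appended to the preamble after the recorded slot intact.
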
@@ -4081,42 +4096,31 @@ void si_init_tess_factor_ring(struct si_context *sctx)
       return;
    }
 
-   /* The codepath without register shadowing. */
-   si_cs_preamble_add_vgt_flush(sctx);
-
-   /* Append these registers to the init config state. */
-   if (sctx->gfx_level >= GFX7) {
-      si_pm4_set_reg(sctx->cs_preamble_state, R_030938_VGT_TF_RING_SIZE,
-                     S_030938_SIZE(tf_ring_size_field));
-      si_pm4_set_reg(sctx->cs_preamble_state, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
-      if (sctx->gfx_level >= GFX10)
-         si_pm4_set_reg(sctx->cs_preamble_state, R_030984_VGT_TF_MEMORY_BASE_HI,
-                        S_030984_BASE_HI(factor_va >> 40));
-      else if (sctx->gfx_level == GFX9)
-         si_pm4_set_reg(sctx->cs_preamble_state, R_030944_VGT_TF_MEMORY_BASE_HI,
-                        S_030944_BASE_HI(factor_va >> 40));
-      si_pm4_set_reg(sctx->cs_preamble_state, R_03093C_VGT_HS_OFFCHIP_PARAM,
-                     sctx->screen->hs.hs_offchip_param);
-   } else {
-      struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-
-      si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE,
-                     S_008988_SIZE(tf_ring_size_field));
-      si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8);
-      si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-                     sctx->screen->hs.hs_offchip_param);
-      sctx->cs_preamble_tess_rings = pm4;
-
-      if (sctx->screen->info.has_tmz_support) {
-         pm4 = CALLOC_STRUCT(si_pm4_state);
-         uint64_t factor_va_tmz =
-            si_resource(sctx->tess_rings_tmz)->gpu_address + sctx->screen->hs.tess_offchip_ring_size;
-         si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE,
-                        S_008988_SIZE(tf_ring_size_field));
-         si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va_tmz >> 8);
-         si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-                        sctx->screen->hs.hs_offchip_param);
-         sctx->cs_preamble_tess_rings_tmz = pm4;
-      }
-   }
+   /* The codepath without register shadowing is below. */
+
+   /* Add these registers to cs_preamble_state. */
+   for (unsigned tmz = 0; tmz <= 1; tmz++) {
+      struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
+      struct pipe_resource *tf_ring = tmz ? sctx->tess_rings_tmz : sctx->tess_rings;
+
+      if (!tf_ring)
+         continue; /* TMZ not supported */
+
+      uint64_t va = si_resource(tf_ring)->gpu_address + sctx->screen->hs.tess_offchip_ring_size;
+
+      si_cs_preamble_add_vgt_flush(sctx, tmz);
+
+      if (sctx->gfx_level >= GFX7) {
+         si_pm4_set_reg(pm4, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(tf_ring_size_field));
+         si_pm4_set_reg(pm4, R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param);
+         si_pm4_set_reg(pm4, R_030940_VGT_TF_MEMORY_BASE, va >> 8);
+         if (sctx->gfx_level >= GFX10)
+            si_pm4_set_reg(pm4, R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
+         else if (sctx->gfx_level == GFX9)
+            si_pm4_set_reg(pm4, R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
+      } else {
+         si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size_field));
+         si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8);
+         si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param);
+      }
+   }
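
The tess loop differs from the GS one mainly in that the register values themselves depend on which ring is being described: the TMZ pass recomputes the tess-factor base address from the TMZ ring and skips the pass entirely when no TMZ ring exists. A small sketch of that per-mode address selection, again with illustrative stand-in types and names:

#include <stdbool.h>
#include <stdint.h>

struct ring_buffer {
   uint64_t gpu_address;
};

struct tess_state {
   struct ring_buffer *tess_rings;      /* assumed always allocated once tess is used */
   struct ring_buffer *tess_rings_tmz;  /* NULL when TMZ is not supported */
   uint64_t tess_offchip_ring_size;     /* tess factors follow the off-chip area */
};

/* Returns the tess-factor base address for one TMZ mode, or 0 when that mode
 * has no ring and the caller should skip it (the 'continue' in the hunk above). */
static uint64_t tess_factor_va(const struct tess_state *s, bool tmz)
{
   const struct ring_buffer *ring = tmz ? s->tess_rings_tmz : s->tess_rings;

   if (!ring)
      return 0;
   return ring->gpu_address + s->tess_offchip_ring_size;
}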