agx: Lower discard in NIR

We already lower discard in NIR when depth/stencil writes are used in the
shader. This patch extends that lowering to the case where depth/stencil writes
are not used, lowering the discard to a sample_mask instruction instead. This
is a step towards multisampling: the old lowering assumed single-sample, and
there is no way to express a sample mask with a standard NIR discard
instruction, so we need to lower in NIR anyway for sample shading (i.e. when a
discard_if diverges between samples in a pixel).
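
For example, with the new lowering an unconditional discard in such a shader
turns into a single sample_mask write that kills every sample in the pixel.
Roughly (a sketch using the ALL_SAMPLES constant from the pass below, not
literal NIR):

   discard   ->   sample_mask_agx(ALL_SAMPLES, 0x0)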

This changes the discard_if lowering to be free of control flow, instead
executing a sample_mask instruction unconditionally. This seems to be slightly
faster in SuperTuxKart and slightly slower in Dolphin, but I'm not too worried
about that right now.
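
Concretely, the conditional case now selects the live mask instead of branching
around the sample mask write. A rough sketch (the real pass builds this with
nir_bcsel, see agx_nir_lower_discard_zs_emit.c below):

   discard_if cond   ->   live = bcsel(cond, 0x0, ALL_SAMPLES)
                          sample_mask_agx(ALL_SAMPLES, live)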

To make this work, we do need some extra lowering to ensure a sample_mask
instruction always executes, in case a discard_if is buried in other control
flow (as happens in Dolphin's ubershaders). So that's added too, sketched
below. We need it for MSAA anyway, so pardon the line count.
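
That extra lowering (agx_nir_lower_sample_mask) works on the last block, where
nir_lower_io_to_temporaries guarantees the stores live. In pseudocode (a sketch
of the pass added below, not its literal control flow):

   covered = union of the constant TARGET masks of sample_mask_agx
             instructions in the last block
   if covered does not include all nr_samples:
      insert sample_mask_agx(ALL_SAMPLES, ALL_SAMPLES) before the
      store_local_pixel_agx (or at the end of the block)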

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23480>
Alyssa Rosenzweig
2023-05-25 13:22:49 -04:00
committed by Marge Bot
parent 989d6fd378
commit 398851ca53
8 changed files with 241 additions and 80 deletions

View File

@@ -419,26 +419,6 @@ agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
agx_src_index(&instr->src[0]));
}
static void
agx_write_sample_mask_1(agx_builder *b)
{
if (b->shader->nir->info.fs.uses_discard && !b->shader->did_sample_mask) {
/* If the shader uses discard, the sample mask must be written by the
* shader on all execution paths. If we've reached the end of the shader,
* we are therefore still active and need to write a full sample mask.
* TODO: interactions with MSAA and gl_SampleMask writes
*/
agx_sample_mask(b, agx_immediate(0xFF), agx_immediate(1));
agx_signal_pix(b, 1);
b->shader->did_sample_mask = true;
assert(!(b->shader->nir->info.outputs_written &
(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
BITFIELD64_BIT(FRAG_RESULT_STENCIL))) &&
"incompatible");
}
}
static agx_instr *
agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
{
@@ -451,8 +431,6 @@ agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
agx_wait_pix(b, 0x000C);
}
agx_write_sample_mask_1(b);
/* Compact the registers according to the mask */
agx_index compacted[4] = {agx_null()};
@@ -499,7 +477,6 @@ agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
* maybe rename this flag to something more general.
*/
b->shader->out->writes_sample_mask = true;
assert(!b->shader->did_sample_mask && "incompatible");
return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
}
@@ -672,26 +649,6 @@ agx_emit_load_frag_coord(agx_builder *b, agx_index dst,
agx_emit_collect_to(b, dst, 4, dests);
}
/*
* Demoting a helper invocation is logically equivalent to zeroing the sample
* mask. Metal implement discard as such.
*
* XXX: Actually, Metal's "discard" is a demote, and what is implemented here
* is a demote. There might be a better way to implement this to get correct
* helper invocation semantics. For now, I'm kicking the can down the road.
*/
static agx_instr *
agx_emit_discard(agx_builder *b)
{
assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
agx_wait_pix(b, 0x0001);
b->shader->did_writeout = true;
b->shader->out->writes_sample_mask = true;
agx_sample_mask(b, agx_immediate(0xFF), agx_immediate(0));
return agx_signal_pix(b, 1);
}
static agx_instr *
agx_load_compute_dimension(agx_builder *b, agx_index dst,
nir_intrinsic_instr *instr, enum agx_sr base)
@@ -876,9 +833,6 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
agx_emit_load_frag_coord(b, dst, instr);
return NULL;
case nir_intrinsic_discard:
return agx_emit_discard(b);
case nir_intrinsic_sample_mask_agx: {
assert(stage == MESA_SHADER_FRAGMENT);
b->shader->out->writes_sample_mask = true;
@@ -2202,10 +2156,6 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
*/
agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
if (ctx->stage == MESA_SHADER_FRAGMENT && !impl->function->is_preamble)
agx_write_sample_mask_1(&_b);
agx_logical_end(&_b);
agx_stop(&_b);
@@ -2427,7 +2377,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
/* Late clip plane lowering created discards */
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(nir, agx_nir_lower_zs_emit);
NIR_PASS_V(nir, agx_nir_lower_discard_zs_emit);
}
/* Late sysval lowering creates large loads. Load lowering creates unpacks */
@@ -2455,10 +2405,16 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
out->push_count = key->reserved_preamble;
agx_optimize_nir(nir, &out->push_count);
/* Implement conditional discard with real control flow like Metal */
NIR_PASS_V(nir, nir_lower_discard_if,
(nir_lower_discard_if_to_cf | nir_lower_demote_if_to_cf |
nir_lower_terminate_if_to_cf));
/* Create sample_mask instructions late, since NIR's scheduling is not aware
* of the ordering requirements between sample_mask and pixel stores.
*
* Note: when epilogs are used, special handling is required since the sample
* count is dynamic when the main fragment shader is compiled.
*/
if (key->fs.nr_samples) {
assert(nir->info.stage == MESA_SHADER_FRAGMENT);
NIR_PASS_V(nir, agx_nir_lower_sample_mask, key->fs.nr_samples);
}
/* Must be last since NIR passes can remap driver_location freely */
if (nir->info.stage == MESA_SHADER_VERTEX)

View File

@@ -153,6 +153,13 @@ struct agx_fs_shader_key {
* tilebuffer loads (including blending).
*/
bool ignore_tib_dependencies;
/* In a monolithic fragment shader or in a fragment epilogue, the number of
* samples in the tilebuffer. In a non-monolithic fragment shader, leave
* zero. This is used for the correct lowering of sample_mask instructions,
* to ensure that all samples are written out. Can be set conservatively.
*/
unsigned nr_samples;
};
struct agx_shader_key {

View File

@@ -379,7 +379,7 @@ typedef struct {
unsigned alloc;
/* I don't really understand how writeout ops work yet */
bool did_writeout, did_sample_mask;
bool did_writeout;
/* Has r0l been zeroed yet due to control flow? */
bool any_cf;
@@ -793,7 +793,8 @@ void agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
void agx_compute_liveness(agx_context *ctx);
void agx_liveness_ins_update(BITSET_WORD *live, agx_instr *I);
bool agx_nir_lower_zs_emit(nir_shader *s);
bool agx_nir_lower_discard_zs_emit(nir_shader *s);
bool agx_nir_lower_sample_mask(nir_shader *s, unsigned nr_samples);
bool agx_nir_lower_texture(nir_shader *s, bool support_lod_bias);
bool agx_nir_opt_preamble(nir_shader *s, unsigned *preamble_size);
bool agx_nir_lower_load_mask(nir_shader *shader);

View File

@@ -12,7 +12,7 @@
#define BASE_S 2
static bool
lower(nir_function_impl *impl, nir_block *block)
lower_zs_emit(nir_function_impl *impl, nir_block *block)
{
nir_intrinsic_instr *zs_emit = NULL;
bool progress = false;
@@ -72,7 +72,7 @@ lower(nir_function_impl *impl, nir_block *block)
}
static bool
lower_discard_to_z(nir_builder *b, nir_instr *instr, UNUSED void *data)
lower_discard(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
@@ -84,34 +84,42 @@ lower_discard_to_z(nir_builder *b, nir_instr *instr, UNUSED void *data)
b->cursor = nir_before_instr(instr);
if (intr->intrinsic == nir_intrinsic_discard_if)
nir_push_if(b, intr->src[0].ssa);
bool stencil_written =
b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
nir_store_zs_agx(b, nir_imm_intN_t(b, ALL_SAMPLES, 16),
nir_imm_float(b, NAN),
stencil_written ? nir_imm_intN_t(b, 0, 16)
: nir_ssa_undef(b, 1, 16) /* stencil */,
.base = BASE_Z | (stencil_written ? BASE_S : 0));
nir_ssa_def *all_samples = nir_imm_intN_t(b, ALL_SAMPLES, 16);
nir_ssa_def *no_samples = nir_imm_intN_t(b, 0, 16);
if (intr->intrinsic == nir_intrinsic_discard_if)
nir_push_else(b, NULL);
no_samples = nir_bcsel(b, intr->src[0].ssa, no_samples, all_samples);
/* This will get lowered later to zs_emit if needed */
nir_sample_mask_agx(b, all_samples, no_samples);
b->shader->info.fs.uses_discard = false;
b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
nir_instr_remove(instr);
return true;
}
bool
static bool
agx_nir_lower_discard(nir_shader *s)
{
if (!s->info.fs.uses_discard)
return false;
return nir_shader_instructions_pass(
s, lower_discard, nir_metadata_block_index | nir_metadata_dominance,
NULL);
}
static bool
agx_nir_lower_zs_emit(nir_shader *s)
{
bool any_progress = false;
/* If depth/stencil isn't written, there's nothing to lower */
if (!(s->info.outputs_written & (BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
BITFIELD64_BIT(FRAG_RESULT_DEPTH))))
return false;
bool any_progress = false;
nir_foreach_function(function, s) {
if (!function->impl)
continue;
@@ -119,7 +127,7 @@ agx_nir_lower_zs_emit(nir_shader *s)
bool progress = false;
nir_foreach_block(block, function->impl) {
progress |= lower(function->impl, block);
progress |= lower_zs_emit(function->impl, block);
}
if (progress) {
@@ -132,9 +140,17 @@ agx_nir_lower_zs_emit(nir_shader *s)
any_progress |= progress;
}
any_progress |= nir_shader_instructions_pass(
s, lower_discard_to_z, nir_metadata_block_index | nir_metadata_dominance,
NULL);
s->info.fs.uses_discard = false;
return any_progress;
}
bool
agx_nir_lower_discard_zs_emit(nir_shader *s)
{
bool progress = false;
/* Lower depth/stencil writes before discard so the interaction works */
progress |= agx_nir_lower_zs_emit(s);
progress |= agx_nir_lower_discard(s);
return progress;
}

View File

@@ -0,0 +1,176 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "compiler/nir/nir_builder.h"
#include "agx_compiler.h"
/*
* sample_mask takes two bitmasks as arguments, TARGET and LIVE. Each bit refers
* to an indexed sample. Roughly, the instruction does:
*
* foreach sample in TARGET {
* if sample in LIVE {
* mark sample live
* } else {
* kill sample
* }
* }
*
* As a special case, TARGET may be set to all-1s (~0) to refer to all samples
* regardless of the framebuffer sample count.
*
* For example, to discard an entire pixel unconditionally, we could run:
*
* sample_mask ~0, 0
*
* sample_mask must follow these rules:
*
* 1. All sample_mask instructions affecting a sample must execute before a
* local_store_pixel instruction targeting that sample.
*
* 2. If sample_mask is used anywhere in a shader, the state of every sample
* must be set on all execution paths. That is, the union of the TARGET sets
* of executed sample_masks must contain all samples.
*
* 3. If a sample is killed, future sample_mask instructions will have
* no effect. The following code sequence correctly implements a conditional
* discard (if there are no other sample_mask instructions in the shader):
*
* sample_mask discarded, 0
* sample_mask ~0, ~0
*
* 4. If zs_emit is used anywhere in the shader, sample_mask must not be used.
* Instead, zs_emit with depth = NaN can be emitted.
*
* This pass legalizes sample_mask instructions to satisfy these rules.
*/
#define ALL_SAMPLES (0xFF)
#define BASE_Z 1
#define BASE_S 2
static bool
lower_sample_mask_to_zs(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
bool depth_written =
b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH);
bool stencil_written =
b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
b->cursor = nir_before_instr(instr);
/* Existing zs_emit instructions need to be fixed up to write their own depth
* for consistency.
*/
if (intr->intrinsic == nir_intrinsic_store_zs_agx && !depth_written) {
/* Load the current depth at this pixel */
nir_ssa_def *z = nir_channel(b, nir_load_frag_coord(b), 2);
/* Write it out from this store_zs */
nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) | BASE_Z);
nir_instr_rewrite_src_ssa(instr, &intr->src[1], z);
/* We'll set outputs_written after the pass in case there are multiple
* store_zs_agx instructions needing fixup.
*/
b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_ANY;
return true;
}
if (intr->intrinsic != nir_intrinsic_sample_mask_agx)
return false;
nir_ssa_def *target = intr->src[0].ssa;
nir_ssa_def *live = intr->src[1].ssa;
nir_ssa_def *discard = nir_iand(b, target, nir_inot(b, live));
/* Write a NaN depth value for discarded samples */
nir_store_zs_agx(b, discard, nir_imm_float(b, NAN),
stencil_written ? nir_imm_intN_t(b, 0, 16)
: nir_ssa_undef(b, 1, 16) /* stencil */,
.base = BASE_Z | (stencil_written ? BASE_S : 0));
nir_instr_remove(instr);
return true;
}
bool
agx_nir_lower_sample_mask(nir_shader *shader, unsigned nr_samples)
{
if (!(shader->info.outputs_written &
(BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK))))
return false;
/* sample_mask can't be used with zs_emit, so lower sample_mask to zs_emit */
if (shader->info.outputs_written & (BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
BITFIELD64_BIT(FRAG_RESULT_STENCIL))) {
bool progress = nir_shader_instructions_pass(
shader, lower_sample_mask_to_zs,
nir_metadata_block_index | nir_metadata_dominance, NULL);
/* The lowering requires an unconditional depth write. We mark this after
* lowering so the lowering knows whether there was already a depth write
*/
assert(progress && "must have lowered something, given the outputs");
shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DEPTH);
return true;
}
/* nir_lower_io_to_temporaries ensures that stores are in the last block */
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_block *block = nir_impl_last_block(impl);
nir_builder b;
nir_builder_init(&b, impl);
/* Check which samples get a value written in the last block */
uint8_t samples_set = 0;
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_sample_mask_agx)
continue;
if (!nir_src_is_const(intr->src[0]))
continue;
samples_set |= nir_src_as_uint(intr->src[0]);
}
/* If all samples are set, we're good to go */
if ((samples_set & BITFIELD_MASK(nr_samples)) == BITFIELD_MASK(nr_samples))
return false;
/* Otherwise, at least one sample is not set in the last block and hence may
* not be set at all. Insert an instruction in the last block to ensure it
* will be live.
*/
b.cursor = nir_after_block(block);
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_local_pixel_agx)
continue;
b.cursor = nir_before_instr(instr);
break;
}
nir_sample_mask_agx(&b, nir_imm_intN_t(&b, ALL_SAMPLES, 16),
nir_imm_intN_t(&b, ALL_SAMPLES, 16));
return true;
}

View File

@@ -9,7 +9,8 @@ libasahi_agx_files = files(
'agx_insert_waits.c',
'agx_nir_lower_address.c',
'agx_nir_lower_frag_sidefx.c',
'agx_nir_lower_zs_emit.c',
'agx_nir_lower_sample_mask.c',
'agx_nir_lower_discard_zs_emit.c',
'agx_nir_lower_texture.c',
'agx_nir_lower_load_mask.c',
'agx_nir_lower_shared_bitsize.c',

View File

@@ -75,6 +75,7 @@ agx_build_background_shader(struct agx_meta_cache *cache,
struct agx_shader_key compiler_key = {
.fs.ignore_tib_dependencies = true,
.fs.nr_samples = key->tib.nr_samples,
};
for (unsigned rt = 0; rt < ARRAY_SIZE(key->op); ++rt) {

View File

@@ -1418,6 +1418,9 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
struct agx_shader_key base_key = {0};
if (nir->info.stage == MESA_SHADER_FRAGMENT)
base_key.fs.nr_samples = 1;
NIR_PASS_V(nir, agx_nir_lower_sysvals, compiled,
&base_key.reserved_preamble);