panvk: implement multiview support

In Valhall multiview, position/varying shaders are invoked once per
draw. Each invocation writes separate outputs for all views. Fragment
processing is handled by the existing multilayer support.

Note that because the hardware only supports up to 8 views, we don't
have to handle the case where there are too many layers to fit in one
tiler descriptor when multiview is enabled.

Signed-off-by: Benjamin Lee <benjamin.lee@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31704>
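The compiler-side changes below boil down to computing a per-view offset from the base output index preloaded in r59: 4 units per view for varyings, and 4 or 8 per view for positions depending on whether the extended position FIFO format (layer/psiz written) is in use. A minimal standalone sketch of that rule, assuming those strides; the helper name and parameters are illustrative, not driver API:

   /* Illustrative sketch of the index_offset computation the patch adds
    * to bi_emit_store_vary; not actual driver code. */
   static unsigned
   per_view_index_offset(bool varying, bool has_psiz_or_layer,
                         bool extended_position_fifo, unsigned view_index)
   {
      unsigned offset = 0;

      /* layer/psiz outputs displace the position record by one slot */
      if (has_psiz_or_layer)
         offset += 4;

      if (varying)
         offset += view_index * 4;
      else
         offset += view_index * (extended_position_fifo ? 8 : 4);

      return offset;
   }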
@@ -1047,7 +1047,8 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
 
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-   if (intr->intrinsic != nir_intrinsic_store_output)
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
       return false;
 
    if (bi_should_remove_store(intr, *idvs)) {
@@ -1127,11 +1128,12 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
                   bi_imm_u32(format), regfmt, nr - 1);
    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
       bi_index index = bi_preload(b, 59);
+      unsigned index_offset = 0;
       unsigned pos_attr_offset = 0;
       unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
 
       if (psiz || layer)
-         index = bi_iadd_imm_i32(b, index, 4);
+         index_offset += 4;
 
       if (layer) {
          assert(nr == 1 && src_bit_sz == 32);
@@ -1143,11 +1145,29 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
       if (psiz)
          assert(T_size == 16 && "should've been lowered");
 
+      bool varying = (b->shader->idvs == BI_IDVS_VARYING);
+
+      if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
+         unsigned view_index = nir_src_as_uint(instr->src[1]);
+
+         if (varying) {
+            index_offset += view_index * 4;
+         } else {
+            /* We don't patch these offsets in the no_psiz variant, so if
+             * multiview is enabled we can't switch to the basic format by
+             * using no_psiz */
+            bool extended_position_fifo = b->shader->nir->info.outputs_written &
+                                          (VARYING_BIT_LAYER | VARYING_BIT_PSIZ);
+            unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
+            index_offset += view_index * position_fifo_stride;
+         }
+      }
+
+      if (index_offset != 0)
+         index = bi_iadd_imm_i32(b, index, index_offset);
       bi_index address = bi_lea_buf_imm(b, index);
       bi_emit_split_i32(b, a, address, 2);
 
-      bool varying = (b->shader->idvs == BI_IDVS_VARYING);
-
       bi_store(b, nr * src_bit_sz, data, a[0], a[1],
                varying ? BI_SEG_VARY : BI_SEG_POS,
                varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
@@ -1739,6 +1759,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       break;
 
    case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_view_output:
       if (stage == MESA_SHADER_FRAGMENT)
          bi_emit_fragment_out(b, instr);
       else if (stage == MESA_SHADER_VERTEX)
@@ -1978,6 +1999,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_derivative(b, dst, instr, 2, true);
       break;
 
+   case nir_intrinsic_load_view_index:
    case nir_intrinsic_load_layer_id:
       assert(b->shader->arch >= 9);
       bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
@@ -105,6 +105,7 @@ struct panfrost_compile_inputs {
    } blend;
    bool no_idvs;
    bool no_ubo_to_push;
+   uint32_t view_mask;
 
    /* Used on Valhall.
    *
@@ -36,7 +36,8 @@
 static bool
 lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
-   if (intr->intrinsic != nir_intrinsic_store_output)
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
       return false;
 
    struct hash_table_u64 *slots = data;
@@ -44,6 +45,11 @@ lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_src *slot_src = nir_get_io_offset_src(intr);
    uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
 
+   if (intr->intrinsic == nir_intrinsic_store_per_view_output) {
+      uint64_t view_index = nir_src_as_uint(intr->src[1]);
+      slot |= view_index << 32;
+   }
+
    nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
    unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
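Folding the view index into the upper 32 bits of the slot key, as the hunk above does, keeps writes to the same location from different views distinct in the deduplication table. A sketch of the keying rule, with an illustrative helper name:

   /* Low 32 bits: I/O slot (base + constant offset); high 32 bits: view
    * index for store_per_view_output, zero otherwise. Per-view writes to
    * the same slot therefore never merge. */
   static uint64_t
   store_component_key(uint64_t slot, uint64_t view_index)
   {
      return slot | (view_index << 32);
   }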
@@ -1571,6 +1571,7 @@ set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
 
       cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
       cfg.primitive_restart = ia->primitive_restart_enable;
+      cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
    }
 
    cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
@@ -1857,8 +1858,12 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
     * we decide to support layared+indirect, we'll need to pass the
     * layer_count info through the tiler descriptor, for instance by
     * re-using one of the word that's flagged 'ignored' in the descriptor
-    * (word 14:23). */
-   assert(cmdbuf->state.gfx.render.layer_count <= 1);
+    * (word 14:23).
+    *
+    * Multiview is limited to 8 layers, and so will always fit in one TD.
+    * Therefore layered rendering is allowed with multiview. */
+   assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
+          cmdbuf->state.gfx.render.view_mask);
 
    /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
    assert(draw->indirect.draw_count == 1);
@@ -1978,7 +1983,10 @@ panvk_per_arch(cmd_inherit_render_state)(
           sizeof(cmdbuf->state.gfx.render.s_attachment));
    cmdbuf->state.gfx.render.bound_attachments = 0;
 
-   cmdbuf->state.gfx.render.layer_count = 0;
+   cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
+   cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
+                                          util_last_bit(inheritance_info->viewMask) :
+                                          0;
    *fbinfo = (struct pan_fb_info){
       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
       .nr_samples = inheritance_info->rasterizationSamples,
@@ -40,6 +40,7 @@ struct panvk_resolve_attachment {
 struct panvk_rendering_state {
    VkRenderingFlags flags;
    uint32_t layer_count;
+   uint32_t view_mask;
 
    enum vk_rp_attachment_flags bound_attachments;
    struct {
@@ -227,7 +227,10 @@ panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
    memset(&state->render.s_attachment, 0, sizeof(state->render.s_attachment));
    state->render.bound_attachments = 0;
 
-   state->render.layer_count = pRenderingInfo->layerCount;
+   cmdbuf->state.gfx.render.layer_count = pRenderingInfo->viewMask ?
+                                          util_last_bit(pRenderingInfo->viewMask) :
+                                          pRenderingInfo->layerCount;
+   cmdbuf->state.gfx.render.view_mask = pRenderingInfo->viewMask;
    *fbinfo = (struct pan_fb_info){
       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
       .nr_samples = 1,
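Both render-state hunks above apply the same rule: with multiview enabled, the layer count is derived from the highest view index rather than taken from the API layer count. A sketch of that rule, using Mesa's util_last_bit helper (which returns one past the index of the most significant set bit); the function name is illustrative:

   /* e.g. view_mask = 0b0101 -> util_last_bit() = 3, so three layers are
    * needed even though only views 0 and 2 are rendered */
   static uint32_t
   effective_layer_count(uint32_t view_mask, uint32_t api_layer_count)
   {
      return view_mask ? util_last_bit(view_mask) : api_layer_count;
   }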
@@ -390,7 +393,7 @@ panvk_per_arch(cmd_resolve_attachments)(struct panvk_cmd_buffer *cmdbuf)
          .extent.height = fbinfo->extent.maxy - fbinfo->extent.miny + 1,
       },
       .layerCount = cmdbuf->state.gfx.render.layer_count,
-      .viewMask = 0,
+      .viewMask = cmdbuf->state.gfx.render.view_mask,
       .colorAttachmentCount = color_att_count,
       .pColorAttachments = color_atts,
       .pDepthAttachment = &z_att,
@@ -372,6 +372,9 @@ panvk_hash_graphics_state(struct vk_physical_device *device,
    _mesa_blake3_update(&blake3_ctx, &sample_shading_enable,
                        sizeof(sample_shading_enable));
 
+   _mesa_blake3_update(&blake3_ctx, &state->rp->view_mask,
+                       sizeof(state->rp->view_mask));
+
    _mesa_blake3_final(&blake3_ctx, blake3_out);
 }
 
@@ -458,6 +461,23 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
       to_panvk_instance(dev->vk.physical->instance);
    gl_shader_stage stage = nir->info.stage;
 
+#if PAN_ARCH >= 10
+   if (stage == MESA_SHADER_VERTEX && compile_input->view_mask) {
+      nir_lower_multiview_options options = {
+         .view_mask = compile_input->view_mask,
+         .allowed_per_view_outputs = ~0
+      };
+      /* The only case where this should fail is with memory/image writes,
+       * which we don't support in vertex shaders */
+      assert(nir_can_lower_multiview(nir, options));
+      NIR_PASS(_, nir, nir_lower_multiview, options);
+      /* Pull output writes out of the loop and give them constant offsets for
+       * pan_lower_store_components */
+      NIR_PASS(_, nir, nir_lower_io_to_temporaries,
+               nir_shader_get_entrypoint(nir), true, false);
+   }
+#endif
+
    NIR_PASS(_, nir, panvk_per_arch(nir_lower_descriptors), dev, rs,
             set_layout_count, set_layouts, shader);
 
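For intuition on the hunk above (a gloss, not text from the patch): nir_lower_multiview conceptually re-emits the vertex shader's per-view output writes once per set bit in view_mask, turning each store_output of a per-view output into a store_per_view_output tagged with its view index. Enumerating the views a mask selects looks like this sketch, using Mesa's u_foreach_bit helper:

   /* iterate the views enabled by a mask, e.g. 0b0101 -> views 0 and 2 */
   u_foreach_bit(view, view_mask) {
      /* emit outputs for view `view` ... */
   }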
@@ -835,6 +855,7 @@ panvk_compile_shader(struct panvk_device *dev,
    struct panfrost_compile_inputs inputs = {
       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
+      .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
    };
 
    panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,