tu: support KHR_8bit_storage
Add basic KHR_8bit_storage support for Adreno 750 devices, for now
enabling only the storageBuffer8BitAccess feature.

A separate descriptor is provided for 8-bit storage access, and the
descriptor index is adjusted appropriately for 8-bit SSBO loads and
stores.

8-bit SSBO loads cannot go through isam since that instruction cannot
handle them. The ldib and stib instruction encodings are a bit peculiar,
but they match the blob's image buffer access through VK_FORMAT_R8 and
the dedicated descriptor. These loads and stores do not work in
vectorized form, so they have to be scalarized. Additionally, stores of
8-bit values have to clear the higher bits of those values, since 8-bit
truncation can leave the higher bits undefined.

Zero-extension of 8-bit values has to use masking since the
corresponding cov instruction doesn't function as intended. 8-bit sign
extension through cov from a non-shared to a shared register also
doesn't work, so an exception is applied to avoid it. Conversion of
8-bit values to and from floating-point values likewise doesn't work
with a straightforward cov instruction; instead the conversion has to
go through a 16-bit value.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9979
Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28254>
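For context on what the new feature bit unlocks, here is a minimal sketch of how an application would detect this support, using only core Vulkan 1.2 structures (standard API, not part of this patch):

#include <vulkan/vulkan.h>

/* Query whether the device (e.g. an Adreno 750 with this patch) exposes
 * storageBuffer8BitAccess before using 8-bit SSBO types in shaders. */
static VkBool32
has_8bit_ssbo_access(VkPhysicalDevice pdev)
{
   VkPhysicalDevice8BitStorageFeatures f8 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &f8,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);
   /* Only storageBuffer8BitAccess is enabled by this patch; the uniform
    * and push-constant variants remain false. */
   return f8.storageBuffer8BitAccess;
}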
@@ -448,7 +448,7 @@ Vulkan 1.1 -- all DONE: anv, lvp, nvk, radv, tu, vn

 Vulkan 1.2 -- all DONE: anv, nvk, tu, vn

-  VK_KHR_8bit_storage                                  DONE (anv, dzn, hasvk, lvp, nvk, radv, v3dv, vn)
+  VK_KHR_8bit_storage                                  DONE (anv, dzn, hasvk, lvp, nvk, radv, tu/a750+, v3dv, vn)
   VK_KHR_buffer_device_address                         DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
   VK_KHR_create_renderpass2                            DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
   VK_KHR_depth_stencil_resolve                         DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
@@ -271,6 +271,8 @@ struct fd_dev_info {
        * best thing we could do is a toggle.
        */
       bool enable_tp_ubwc_flag_hint;
+
+      bool storage_8bit;
    } a7xx;
 };

@@ -856,6 +856,7 @@ a7xx_750 = A7XXProps(
     # now.
     #supports_ibo_ubwc = True,
     no_gs_hw_binning_quirk = True,
+    storage_8bit = True,
 )

 a730_magic_regs = dict(
@@ -1175,12 +1175,14 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
       /* floating-point conversions when moving from non-shared to shared
        * seem not to work. We only use floating-point types in ir3 for
        * conversions, so don't bother specially handling the case where the
-       * types are equal.
+       * types are equal. Same goes for 8-bit sign extension.
        */
       if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
          !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
-          (full_type(instr->cat1.src_type) == TYPE_F32 ||
-           full_type(instr->cat1.dst_type) == TYPE_F32))
+          ((full_type(instr->cat1.src_type) == TYPE_F32 ||
+            full_type(instr->cat1.dst_type) == TYPE_F32) ||
+           (instr->cat1.src_type == TYPE_U8 &&
+            full_type(instr->cat1.dst_type) == TYPE_S32)))
          return false;

       /* Conversions seem not to work in shared->shared copies before scalar
@@ -69,7 +69,24 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    ldib->dsts[0]->wrmask = MASK(intr->num_components);
    ldib->cat6.iim_val = intr->num_components;
    ldib->cat6.d = 1;
-   ldib->cat6.type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   switch (intr->def.bit_size) {
+   case 8:
+      /* This encodes the 8-bit SSBO load and matches blob's encoding of
+       * imageBuffer access using VK_FORMAT_R8 and the dedicated 8-bit
+       * descriptor. No vectorization is possible.
+       */
+      assert(intr->num_components == 1);
+
+      ldib->cat6.type = TYPE_U16;
+      ldib->cat6.typed = true;
+      break;
+   case 16:
+      ldib->cat6.type = TYPE_U16;
+      break;
+   default:
+      ldib->cat6.type = TYPE_U32;
+      break;
+   }
    ldib->barrier_class = IR3_BARRIER_BUFFER_R;
    ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;

@@ -100,6 +117,17 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
     */
    val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);

+   /* Any 8-bit store will be done on a single-component value that additionally
+    * has to be masked to clear up the higher bits or it will malfunction.
+    */
+   if (intr->src[0].ssa->bit_size == 8) {
+      assert(ncomp == 1);
+
+      struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
+      val = ir3_AND_B(b, val, 0, mask, 0);
+      val->dsts[0]->flags |= IR3_REG_HALF;
+   }
+
    lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val);
    struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);

@@ -107,7 +135,24 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                      imm_offset, 0, val, 0);
    stib->cat6.iim_val = ncomp;
    stib->cat6.d = 1;
-   stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   switch (intr->src[0].ssa->bit_size) {
+   case 8:
+      /* As with ldib, this encodes the 8-bit SSBO store and matches blob's
+       * encoding of imageBuffer access using VK_FORMAT_R8 and the extra 8-bit
+       * descriptor. No vectorization is possible and we have to override the
+       * relevant field anyway.
+       */
+      stib->cat6.type = TYPE_U16;
+      stib->cat6.iim_val = 4;
+      stib->cat6.typed = true;
+      break;
+   case 16:
+      stib->cat6.type = TYPE_U16;
+      break;
+   default:
+      stib->cat6.type = TYPE_U32;
+      break;
+   }
    stib->barrier_class = IR3_BARRIER_BUFFER_W;
    stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

@@ -67,6 +67,8 @@ struct ir3_compiler_options {

    /* True if 16-bit descriptors are available. */
    bool storage_16bit;
+   /* True if 8-bit descriptors are available. */
+   bool storage_8bit;

    /* If base_vertex should be lowered in nir */
    bool lower_base_vertex;
@@ -273,6 +273,50 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
    if (src_type == dst_type)
       return src;

+   /* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
+    * is used to achieve the result.
+    */
+   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
+      struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
+      struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
+      cov->dsts[0]->flags |= type_flags(dst_type);
+      return cov;
+   }
+
+   /* Conversion of 8-bit values into floating-point values doesn't work with
+    * a simple `cov`, instead the 8-bit values first have to be converted into
+    * corresponding 16-bit values and converted from there.
+    */
+   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
+      assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
+             op == nir_op_u2f32 || op == nir_op_i2f32);
+
+      struct ir3_instruction *cov;
+      if (op == nir_op_u2f16 || op == nir_op_u2f32) {
+         struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
+         cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
+         cov->dsts[0]->flags |= IR3_REG_HALF;
+         cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
+      } else {
+         cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
+         cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
+      }
+      return cov;
+   }
+
+   /* Conversion of floating-point values to 8-bit values also doesn't work
+    * through a single `cov`, instead the conversion has to go through the
+    * corresponding 16-bit type that's then truncated.
+    */
+   if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
+      assert(op == nir_op_f2u8 || op == nir_op_f2i8);
+
+      type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
+      struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
+      cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
+      return cov;
+   }
+
    struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);

    if (op == nir_op_f2f16_rtne) {
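To make the workarounds above concrete, here is an illustrative host-side C model of the conversion semantics the emitted instruction sequences must reproduce (in-range values only; helper names are made up for this sketch and are not part of ir3):

#include <stdint.h>

/* Zero-extension: emitted as an AND with 0xff instead of a cov. */
static uint32_t model_zext_u8(uint8_t v) { return (uint32_t)v & 0xff; }

/* Unsigned 8-bit -> float: mask to a 16-bit value, then convert
 * (the AND + ir3_COV(TYPE_U16, dst_type) path in the hunk above). */
static float model_u2f32(uint8_t v) { return (float)(uint16_t)(v & 0xff); }

/* Signed 8-bit -> float: sign-extend to 16 bits first, then convert
 * (the ir3_COV(TYPE_U8, TYPE_S16) + ir3_COV(TYPE_S16, ...) path). */
static float model_i2f32(int8_t v) { return (float)(int16_t)v; }

/* float -> 8-bit: convert to the matching 16-bit type, then truncate
 * (ir3_COV to intermediate_type, then ir3_COV down to TYPE_U8). */
static uint8_t model_f2u8(float f) { return (uint8_t)(uint16_t)f; }
static int8_t  model_f2i8(float f) { return (int8_t)(int16_t)f; }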
@@ -1611,9 +1655,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
 {
    /* Note: we can only use isam for vectorized loads/stores if isam.v is
     * available.
+    * Note: isam also can't handle 8-bit loads.
     */
    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
        (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
+       (ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
        !ctx->compiler->has_isam_ssbo) {
       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
       return;
@@ -416,7 +416,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
          return false;
       if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc))
          return false;
-   } else if (src->cat1.dst_type == TYPE_U16) {
+   } else if (src->cat1.dst_type == TYPE_U16 || src->cat1.dst_type == TYPE_S16) {
       /* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of
        * what was a U16 value read from the constbuf would incorrectly
        * do 32f->16f conversion, when we want to read a 16f value.
@@ -100,6 +100,13 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
       return true;
    }

+   if ((intrin->intrinsic == nir_intrinsic_load_ssbo &&
+        intrin->def.bit_size == 8) ||
+       (intrin->intrinsic == nir_intrinsic_store_ssbo &&
+        intrin->src[0].ssa->bit_size == 8)) {
+      return true;
+   }
+
    return false;
 }

@@ -187,6 +187,11 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
        (!has_dest && intrinsic->src[0].ssa->bit_size == 16))
       shift = 1;

+   /* for 8-bit ssbo access, offset is in 8-bit words instead of dwords */
+   if ((has_dest && intrinsic->def.bit_size == 8) ||
+       (!has_dest && intrinsic->src[0].ssa->bit_size == 8))
+      shift = 0;
+
    /* Here we create a new intrinsic and copy over all contents from the old
     * one. */

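Put differently, the ir3 SSBO offset source is expressed in units of the access size. A small sketch of the mapping from a byte offset, using the same shift values as the hunk above (the helper name is hypothetical):

#include <assert.h>
#include <stdint.h>

/* Convert a byte offset into the word offset load_ssbo_ir3/store_ssbo_ir3
 * expect: dwords for 32-bit access (shift = 2), 16-bit words for 16-bit
 * access (shift = 1), and bytes for 8-bit access (shift = 0). */
static uint32_t
ssbo_word_offset(uint32_t byte_offset, unsigned bit_size)
{
   unsigned shift = bit_size == 8 ? 0 : bit_size == 16 ? 1 : 2;
   assert((byte_offset & ((1u << shift) - 1)) == 0); /* must stay aligned */
   return byte_offset >> shift;
}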
@@ -326,7 +331,8 @@ ir3_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data)

    switch (intrin->intrinsic) {
    case nir_intrinsic_load_ssbo_ir3:
-      if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER))
+      if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
+          !(compiler->options.storage_8bit && intrin->def.bit_size == 8))
          return 255; /* isam.v */
       return 127; /* ldib.b */
    case nir_intrinsic_store_ssbo_ir3:
@@ -761,10 +761,14 @@ can_demote_src(struct ir3_instruction *instr)
    case OPC_META_COLLECT:
       return false;
    case OPC_MOV:
-      /* non-shared -> shared floating-point conversions don't work */
+      /* non-shared -> shared floating-point conversions and
+       * 8-bit sign extension don't work.
+       */
       return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
-              (full_type(instr->cat1.src_type) != TYPE_F32 &&
-               full_type(instr->cat1.dst_type) != TYPE_F32));
+              !((full_type(instr->cat1.src_type) == TYPE_F32 ||
+                 full_type(instr->cat1.dst_type) == TYPE_F32) ||
+                (instr->cat1.src_type == TYPE_U8 &&
+                 full_type(instr->cat1.dst_type) == TYPE_S32)));
    default:
       return (!is_alu(instr) && !is_sfu(instr)) ||
              !(instr->dsts[0]->flags & IR3_REG_SHARED);
@@ -2795,6 +2795,7 @@ tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd,
    case FMT6_32_UINT:
       offset_shift = 2;
       break;
+   case FMT6_8_UINT:
    default:
       offset_shift = 0;
       break;
@@ -69,7 +69,8 @@ descriptor_size(struct tu_device *dev,
       */
       return A6XX_TEX_CONST_DWORDS * 4 * (1 +
          COND(dev->physical_device->info->a6xx.storage_16bit &&
-              !dev->physical_device->info->a6xx.has_isam_v, 1));
+              !dev->physical_device->info->a6xx.has_isam_v, 1) +
+          COND(dev->physical_device->info->a7xx.storage_8bit, 1));
    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
       return binding->descriptorCount;
    default:
@@ -1016,9 +1017,17 @@ write_buffer_descriptor_addr(const struct tu_device *device,
     * 16-bit and 32-bit access through isam.v will of course only be functional
     * when 16-bit storage is supported. */
    assert(!info->a6xx.has_isam_v || info->a6xx.storage_16bit);
+   /* Any configuration enabling 8-bit storage support will also provide 16-bit
+    * storage support and 16-bit descriptors capable of 32-bit isam loads. This
+    * indirectly ensures we won't need more than two descriptors for access of
+    * any size.
+    */
+   assert(!info->a7xx.storage_8bit || (info->a6xx.storage_16bit &&
+                                       info->a6xx.has_isam_v));

-   unsigned num_descriptors = 1 + COND(info->a6xx.storage_16bit &&
-                                       !info->a6xx.has_isam_v, 1);
+   unsigned num_descriptors = 1 +
+      COND(info->a6xx.storage_16bit && !info->a6xx.has_isam_v, 1) +
+      COND(info->a7xx.storage_8bit, 1);
    memset(dst, 0, num_descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));

    if (!buffer_info || buffer_info->address == 0)
@@ -1053,6 +1062,18 @@ write_buffer_descriptor_addr(const struct tu_device *device,
          A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
       dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
       dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
+      dst += A6XX_TEX_CONST_DWORDS;
+   }
+
+   if (info->a7xx.storage_8bit) {
+      dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
+      dst[1] = range;
+      dst[2] =
+         A6XX_TEX_CONST_2_STRUCTSIZETEXELS(1) |
+         A6XX_TEX_CONST_2_STARTOFFSETTEXELS(offset) |
+         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
+      dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
+      dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
    }
 }

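A storage-buffer binding thus packs up to three consecutive texture descriptors. A sketch of the resulting size arithmetic, mirroring descriptor_size() and write_buffer_descriptor_addr() above (assuming A6XX_TEX_CONST_DWORDS is 16, as in Mesa at the time of this patch):

#include <stdbool.h>

#define A6XX_TEX_CONST_DWORDS 16 /* assumed per-descriptor dword count */

/* Slots within one storage-buffer binding:
 * slot 0: the base descriptor;
 * +1 slot: extra descriptor when 16-bit storage exists but isam.v doesn't;
 * +1 slot: dedicated 8-bit (FMT6_8_UINT) descriptor on a750+. */
static unsigned
storage_desc_size_bytes(bool storage_16bit, bool has_isam_v, bool storage_8bit)
{
   unsigned n = 1;
   if (storage_16bit && !has_isam_v)
      n++;
   if (storage_8bit)
      n++;
   return n * A6XX_TEX_CONST_DWORDS * 4;
}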
@@ -142,6 +142,7 @@ get_device_extensions(const struct tu_physical_device *device,
                       struct vk_device_extension_table *ext)
 {
    *ext = (struct vk_device_extension_table) { .table = {
+      .KHR_8bit_storage = device->info->a7xx.storage_8bit,
       .KHR_16bit_storage = device->info->a6xx.storage_16bit,
       .KHR_bind_memory2 = true,
       .KHR_buffer_device_address = true,
@@ -379,7 +380,7 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* Vulkan 1.2 */
    features->samplerMirrorClampToEdge = true;
    features->drawIndirectCount = true;
-   features->storageBuffer8BitAccess = false;
+   features->storageBuffer8BitAccess = pdevice->info->a7xx.storage_8bit;
    features->uniformAndStorageBuffer8BitAccess = false;
    features->storagePushConstant8 = false;
    features->shaderBufferInt64Atomics = false;
@@ -1096,7 +1097,8 @@ tu_get_properties(struct tu_physical_device *pdevice,
    props->uniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
    props->robustUniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
    props->storageBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4 * (1 +
-      COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1));
+      COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1) +
+      COND(pdevice->info->a7xx.storage_8bit, 1));
    props->robustStorageBufferDescriptorSize =
       props->storageBufferDescriptorSize;
    props->inputAttachmentDescriptorSize = TU_DEBUG(DYNAMIC) ?
@@ -2301,6 +2303,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       .bindless_fb_read_descriptor = -1,
       .bindless_fb_read_slot = -1,
       .storage_16bit = physical_device->info->a6xx.storage_16bit,
+      .storage_8bit = physical_device->info->a7xx.storage_8bit,
       .shared_push_consts = !TU_DEBUG(PUSH_CONSTS_PER_STAGE),
    };
    device->compiler = ir3_compiler_create(
@@ -282,15 +282,20 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       }
    }

-   /* For isam, we need to adjust the descriptor index to use the 32-bit
-    * descriptor if 16-bit storage support is present but the 16-bit descriptor
-    * cannot be used for 32-bit access through isam.v.
+   /* Descriptor index has to be adjusted in the following cases:
+    * - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
+    *   loads -- next-index descriptor will be able to do that;
+    * - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
+    *   storage accesses of that size.
     */
-   if (dev->physical_device->info->a6xx.storage_16bit &&
-       !dev->physical_device->info->a6xx.has_isam_v &&
-       intrin->intrinsic == nir_intrinsic_load_ssbo &&
-       (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
-       intrin->def.bit_size > 16) {
+   if ((dev->physical_device->info->a6xx.storage_16bit &&
+        !dev->physical_device->info->a6xx.has_isam_v &&
+        intrin->intrinsic == nir_intrinsic_load_ssbo &&
+        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
+        intrin->def.bit_size > 16) ||
+       (dev->physical_device->info->a7xx.storage_8bit &&
+        ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
+         (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
    }

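The index adjustment in the last hunk boils down to a small predicate. An illustrative reduction of that logic (a standalone sketch, not the literal turnip code):

#include <stdbool.h>

/* How much to add to the base descriptor index for one SSBO access.
 * The next-index descriptor serves both the isam case (the 16-bit
 * descriptor can't do 32-bit loads without isam.v) and the dedicated
 * 8-bit case, so a single +1 suffices for either. */
static unsigned
descriptor_index_adjustment(bool storage_16bit, bool has_isam_v,
                            bool storage_8bit, bool is_load,
                            bool can_reorder, unsigned bit_size)
{
   bool isam_needs_32bit_desc = storage_16bit && !has_isam_v &&
                                is_load && can_reorder && bit_size > 16;
   bool is_8bit_access = storage_8bit && bit_size == 8;
   return (isam_needs_32bit_desc || is_8bit_access) ? 1 : 0;
}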