intel/brw: Handle 16-bit sampler return payloads

The API requires samplers to return 32-bit values even though the hardware can handle 16-bit floating point, so we detect that case and make more efficient use of memory bandwidth. This improves the performance of token encode and decode in LLM workloads by at least 5% across multiple platforms. Thank you, Kenneth Graunke, for suggesting this and guiding me throughout the implementation. Signed-off-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30447>
2024-07-30 23:04:34 -07:00
parent ddd9e043dc
commit 0116430d39
3 changed files with 27 additions and 9 deletions
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -8623,7 +8623,12 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,

   brw_reg nir_def_reg = get_nir_def(ntb, instr->def);

-   brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
+   bool is_simd8_16bit = nir_alu_type_get_type_size(instr->dest_type) == 16
+      && bld.dispatch_width() == 8;
+
+   brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type),
+      (is_simd8_16bit ? 8 : 4) + instr->is_sparse);
+
   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
   inst->offset = header_bits;

@@ -8635,15 +8640,18 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
      if (instr->is_sparse) {
         read_size = util_last_bit(write_mask) - 1;
         inst->size_written =
-            read_size * inst->dst.component_size(inst->exec_size) +
+            (is_simd8_16bit ? 2 : 1) * read_size *
+            inst->dst.component_size(inst->exec_size) +
            (reg_unit(devinfo) * REG_SIZE);
      } else {
         read_size = util_last_bit(write_mask);
         inst->size_written =
-            read_size * inst->dst.component_size(inst->exec_size);
+            (is_simd8_16bit ? 2 : 1) * read_size *
+            inst->dst.component_size(inst->exec_size);
      }
   } else {
-      inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
+      inst->size_written = (is_simd8_16bit ? 2 : 1) * 4 *
+                           inst->dst.component_size(inst->exec_size) +
                           (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
   }

@@ -8666,7 +8674,8 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
      inst->keep_payload_trailing_zeros = true;
   }

-   if (instr->op != nir_texop_query_levels && !instr->is_sparse) {
+   if (instr->op != nir_texop_query_levels && !instr->is_sparse
+      && !is_simd8_16bit) {
      /* In most cases we can write directly to the result. */
      inst->dst = nir_def_reg;
   } else {
@@ -8675,7 +8684,7 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
       */
      brw_reg nir_dest[5];
      for (unsigned i = 0; i < read_size; i++)
-         nir_dest[i] = offset(dst, bld, i);
+         nir_dest[i] = offset(dst, bld, (is_simd8_16bit ? 2 : 1) * i);

      if (instr->op == nir_texop_query_levels) {
         /* # levels is in .w */
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -1124,13 +1124,16 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
   inst->mlen = mlen;
   inst->header_size = header_size;
   inst->sfid = BRW_SFID_SAMPLER;
+   uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
+      ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
+      : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
@@ -1140,7 +1143,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
@@ -1166,7 +1169,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
      if (surface.equals(sampler)) {
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -996,6 +996,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,

   OPT(nir_lower_alu_to_scalar, NULL, NULL);

+   struct nir_opt_16bit_tex_image_options options = {
+      .rounding_mode = nir_rounding_mode_undef,
+      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
+   };
+   OPT(nir_opt_16bit_tex_image, &options);
+
   if (nir->info.stage == MESA_SHADER_GEOMETRY)
      OPT(nir_lower_gs_intrinsics, 0);