intel: Set preferred SLM allocation size >= SLM size for Xe2

Xe2 has 2 requirements for preferred SLM size:
- this value needs to be >= the SLM size
- this value must be less than the shared SLM/L1$ RAM in the sub-slice of the platform

Also, Xe2 doesn't have the special '0' encode that sets the preferred SLM allocation size to the maximum supported. So here we set a value that is equal to or larger than the SLM size.

It was always setting SLM_ENCODES_128K for LNL A0 stepping, probably because of Wa_16018610683, but this restriction applies to all Xe2 platforms. Also, because of the first restriction mentioned here, this workaround is not being properly implemented; that will be fixed in the next patch.

We should have a formula to calculate a preferred SLM allocation size for gfx125 and Xe2 platforms, but until then this is enough to fix at least the applications and tests below on LNL:
- GFXBench Aztec Ruins VK
- GravityMark VK
- Wildlife Extreme VK
- 5 crucible tests

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28910>
2024-04-05 13:12:32 -07:00
parent c4478ab4e3
commit ddda68bbf5
7 changed files with 32 additions and 20 deletions
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -8862,13 +8862,14 @@ iris_upload_compute_walker(struct iris_context *ice,
   idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
   idd.SharedLocalMemorySize =
      intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
   idd.PreferredSLMAllocationSize =
      intel_compute_preferred_slm_calc_encode_size(devinfo, shader->total_shared);
   idd.SamplerStatePointer = shs->sampler_table.offset;
   idd.SamplerCount = encode_sampler_count(shader),
   idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
   /* Typically set to 0 to avoid prefetching on every thread dispatch. */
   idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
      0 : MIN2(shader->bt.size_bytes / 4, 31);
   idd.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo);
   idd.NumberOfBarriers = cs_data->uses_barrier;
   iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
--- a/src/intel/blorp/blorp_genX_exec_brw.h
+++ b/src/intel/blorp/blorp_genX_exec_brw.h
@@ -1737,7 +1737,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
            intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
-         .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
+         .PreferredSLMAllocationSize =
            intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->total_shared),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
      };
   }
--- a/src/intel/common/intel_compute_slm.c
+++ b/src/intel/common/intel_compute_slm.c
@@ -137,7 +137,7 @@ static struct slm_encode xe2_preferred_slm_allocation_size_table[] = {
  { .encode = 0xA, .size_in_kb = 384, },
 };
-uint32_t
+static uint32_t
 intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
 {
   struct slm_encode *table;
@@ -153,3 +153,23 @@ intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
   return slm_encode_lookup(table, table_len, bytes)->encode;
 }
 /**
  * Pick the value to program as the preferred SLM allocation size for a
  * shader that uses slm_size of shared local memory.
  *
  * devinfo:  device description; only devinfo->ver is read here.
  * slm_size: SLM size of the shader, forwarded unchanged as the `bytes`
  *           argument of intel_compute_preferred_slm_encode_size().
  *
  * Returns 0 when devinfo->ver < 20, otherwise the encode produced by
  * intel_compute_preferred_slm_encode_size() for devinfo->ver and slm_size.
  */
 uint32_t
 intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size)
 {
   /* Platforms older than Xe2 have an encode = 0 that sets the preferred SLM
    * allocation to the maximum supported, so keep using it until we come up
    * with a formula to calculate the optimal preferred SLM allocation.
    */
   if (devinfo->ver < 20)
      return 0;
   /* Xe2 has 2 requirements for preferred SLM size:
    * - this value needs to be >= the SLM size
    * - this value must be less than the shared SLM/L1$ RAM in the sub-slice of the platform
    *
    * For now this does not calculate the optimal preferred SLM allocation,
    * it just sets the minimum value that complies with the first restriction.
    */
   return intel_compute_preferred_slm_encode_size(devinfo->ver, slm_size);
 }
--- a/src/intel/common/intel_compute_slm.h
+++ b/src/intel/common/intel_compute_slm.h
@@ -7,6 +7,8 @@
 #include <stdint.h>
 #include "dev/intel_device_info.h"
 uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes);
 uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes);
-uint32_t intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes);
+uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size);
--- a/src/intel/common/intel_genX_state_brw.h
+++ b/src/intel/common/intel_genX_state_brw.h
@@ -165,19 +165,6 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
 #endif
 #if GFX_VERx10 >= 125
 /**
  * Return the value used for the PreferredSLMAllocationSize field.
  *
  * Yields SLM_ENCODES_128K when the device is INTEL_PLATFORM_LNL with
  * devinfo->revision == 0, and 0 for every other device.
  *
  * NOTE(review): revision 0 presumably corresponds to the LNL A0 stepping and
  * the 128K value to Wa_16018610683, as the commit message suggests — confirm.
  */
 UNUSED static int
 preferred_slm_allocation_size(const struct intel_device_info *devinfo)
 {
   /* Only this platform/revision pair gets a non-zero encode. */
   if (devinfo->platform == INTEL_PLATFORM_LNL && devinfo->revision == 0)
      return SLM_ENCODES_128K;
   return 0;
 }
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -285,7 +285,8 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
-      .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
+      .PreferredSLMAllocationSize =
         intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->base.total_shared),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
 }
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1795,7 +1795,7 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
      task.SharedLocalMemorySize             =
         intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
      task.PreferredSLMAllocationSize        =
-         preferred_slm_allocation_size(devinfo);
+         intel_compute_preferred_slm_calc_encode_size(devinfo, task_prog_data->base.base.total_shared);
      /*
       * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address
@@ -1876,7 +1876,7 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
      mesh.SharedLocalMemorySize             =
         intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
      mesh.PreferredSLMAllocationSize        =
-         preferred_slm_allocation_size(devinfo);
+         intel_compute_preferred_slm_calc_encode_size(devinfo, mesh_prog_data->base.base.total_shared);
      /*
       * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address