intel: Set preferred SLM allocation size >= than SLM size for Xe2

Xe2 has 2 requirements for preferred SLM size:
- this value needs to be >= the SLM size
- this value must be less than the shared SLM/L1$ RAM in the sub-slice of the platform

Also, Xe2 doesn't have the special '0' encoding that sets the preferred SLM
allocation size to the maximum supported.
So this sets a value that is equal to or larger than the SLM size.

It was always setting SLM_ENCODES_128K for the LNL A0 stepping, probably
because of Wa_16018610683, but this restriction applies to all Xe2
platforms. Also, because of the first requirement mentioned above,
this workaround is not being properly implemented; that will be fixed
in the next patch.

We should have a formula to calculate a preferred SLM allocation size
for gfx125 and Xe2 platforms, but until then this is enough to fix at
least the applications and tests below on LNL:
- GFXBench Aztec Ruins VK
- GravityMark VK
- Wildlife Extreme VK
- 5 crucible tests

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28910>
This commit is contained in:
José Roberto de Souza
2024-04-05 13:12:32 -07:00
committed by Marge Bot
parent c4478ab4e3
commit ddda68bbf5
7 changed files with 32 additions and 20 deletions

View File

@@ -8862,13 +8862,14 @@ iris_upload_compute_walker(struct iris_context *ice,
idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads; idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
idd.SharedLocalMemorySize = idd.SharedLocalMemorySize =
intel_compute_slm_encode_size(GFX_VER, shader->total_shared); intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
idd.PreferredSLMAllocationSize =
intel_compute_preferred_slm_calc_encode_size(devinfo, shader->total_shared);
idd.SamplerStatePointer = shs->sampler_table.offset; idd.SamplerStatePointer = shs->sampler_table.offset;
idd.SamplerCount = encode_sampler_count(shader), idd.SamplerCount = encode_sampler_count(shader),
idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
/* Typically set to 0 to avoid prefetching on every thread dispatch. */ /* Typically set to 0 to avoid prefetching on every thread dispatch. */
idd.BindingTableEntryCount = devinfo->verx10 == 125 ? idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bt.size_bytes / 4, 31); 0 : MIN2(shader->bt.size_bytes / 4, 31);
idd.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo);
idd.NumberOfBarriers = cs_data->uses_barrier; idd.NumberOfBarriers = cs_data->uses_barrier;
iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL); iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);

View File

@@ -1737,7 +1737,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = .SharedLocalMemorySize =
intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared), intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), .PreferredSLMAllocationSize =
intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->total_shared),
.NumberOfBarriers = cs_prog_data->uses_barrier, .NumberOfBarriers = cs_prog_data->uses_barrier,
}; };
} }

View File

@@ -137,7 +137,7 @@ static struct slm_encode xe2_preferred_slm_allocation_size_table[] = {
{ .encode = 0xA, .size_in_kb = 384, }, { .encode = 0xA, .size_in_kb = 384, },
}; };
uint32_t static uint32_t
intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes) intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
{ {
struct slm_encode *table; struct slm_encode *table;
@@ -153,3 +153,23 @@ intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
return slm_encode_lookup(table, table_len, bytes)->encode; return slm_encode_lookup(table, table_len, bytes)->encode;
} }
/* Return the encoded value to program into the PreferredSLMAllocationSize
 * field for a shader that uses slm_size bytes of shared local memory.
 *
 * devinfo:  device description; only devinfo->ver is consulted here.
 * slm_size: SLM size of the shader, forwarded as the byte count to
 *           intel_compute_preferred_slm_encode_size().
 */
uint32_t
intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size)
{
/* Platforms older than Xe2 have an encoding of 0 that sets the preferred
* SLM allocation to the maximum supported, so keep using it until we come
* up with a formula to calculate the optimal preferred SLM allocation.
*/
if (devinfo->ver < 20)
return 0;
/* Xe2 has 2 requirements for the preferred SLM size:
* - this value needs to be >= the SLM size
* - this value must be less than the shared SLM/L1$ RAM in the sub-slice
*   of the platform
*
* For now this does not calculate the optimal preferred SLM allocation;
* it just sets the minimum value that complies with the first requirement.
*/
return intel_compute_preferred_slm_encode_size(devinfo->ver, slm_size);
}

View File

@@ -7,6 +7,8 @@
#include <stdint.h> #include <stdint.h>
#include "dev/intel_device_info.h"
uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes); uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes);
uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes); uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes);
uint32_t intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes); uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size);

View File

@@ -165,19 +165,6 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
#endif #endif
#if GFX_VERx10 >= 125
UNUSED static int
preferred_slm_allocation_size(const struct intel_device_info *devinfo)
{
if (devinfo->platform == INTEL_PLATFORM_LNL && devinfo->revision == 0)
return SLM_ENCODES_128K;
return 0;
}
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -285,7 +285,8 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
0 : 1 + MIN2(shader->bind_map.surface_count, 30), 0 : 1 + MIN2(shader->bind_map.surface_count, 30),
.NumberofThreadsinGPGPUThreadGroup = dispatch->threads, .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared), .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), .PreferredSLMAllocationSize =
intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->base.total_shared),
.NumberOfBarriers = prog_data->uses_barrier, .NumberOfBarriers = prog_data->uses_barrier,
}; };
} }

View File

@@ -1795,7 +1795,7 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
task.SharedLocalMemorySize = task.SharedLocalMemorySize =
intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared); intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
task.PreferredSLMAllocationSize = task.PreferredSLMAllocationSize =
preferred_slm_allocation_size(devinfo); intel_compute_preferred_slm_calc_encode_size(devinfo, task_prog_data->base.base.total_shared);
/* /*
* 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address
@@ -1876,7 +1876,7 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
mesh.SharedLocalMemorySize = mesh.SharedLocalMemorySize =
intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared); intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
mesh.PreferredSLMAllocationSize = mesh.PreferredSLMAllocationSize =
preferred_slm_allocation_size(devinfo); intel_compute_preferred_slm_calc_encode_size(devinfo, mesh_prog_data->base.base.total_shared);
/* /*
* 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address