intel: Compute the optimal preferred SLM size per subslice

Up to now the preferred SLM size was being set to the maximum preferred
SLM size on GFX 12.5 platforms and to the workgroup SLM size on Xe2,
but neither of those values is optimal.
The optimal value is:
<number of workgroups that can run per subslice> * <workgroup SLM size>

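For example (illustrative numbers only, not from this patch): if a
subslice can keep 4 workgroups in flight and each workgroup uses 16KB
of SLM, the preferred SLM size becomes 4 * 16KB = 64KB, rather than
the device maximum or the bare 16KB of a single workgroup.
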
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28910>
José Roberto de Souza authored 2024-04-19 14:11:03 -07:00
committed by Marge Bot
commit 07855b0431, parent fd368f5521
6 changed files with 48 additions and 21 deletions


@@ -8863,7 +8863,10 @@ iris_upload_compute_walker(struct iris_context *ice,
    idd.SharedLocalMemorySize =
       intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
    idd.PreferredSLMAllocationSize =
-      intel_compute_preferred_slm_calc_encode_size(devinfo, shader->total_shared);
+      intel_compute_preferred_slm_calc_encode_size(devinfo,
+                                                   shader->total_shared,
+                                                   dispatch.group_size,
+                                                   dispatch.simd_size);
    idd.SamplerStatePointer = shs->sampler_table.offset;
    idd.SamplerCount = encode_sampler_count(shader),
    idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];


@@ -1738,7 +1738,10 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
          .SharedLocalMemorySize =
             intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
          .PreferredSLMAllocationSize =
-            intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->total_shared),
+            intel_compute_preferred_slm_calc_encode_size(devinfo,
+                                                         prog_data->total_shared,
+                                                         dispatch.group_size,
+                                                         dispatch.simd_size),
          .NumberOfBarriers = cs_prog_data->uses_barrier,
       };
    }


@@ -154,22 +154,31 @@ intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
    return slm_encode_lookup(table, table_len, bytes)->encode;
 }
 
+/**
+ * Compute a shared local memory size to be allocated for each sub-slice.
+ * It estimates how many workgroups will run concurrently per sub-slice and
+ * multiplies that by each workgroup's SLM size.
+ */
 uint32_t
-intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size)
+intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo,
+                                             const uint32_t slm_size_per_workgroup,
+                                             const uint32_t invocations_per_workgroup,
+                                             const uint8_t cs_simd)
 {
-   /* Older platforms than Xe2 has a encode = 0 that sets preferred SLM
-    * allocation to maximum supported, so keeping it until we come up
-    * with a formula to calculate the optimal preferred slm allocation.
-    */
    if (devinfo->ver < 20)
       return 0;
 
+   const uint32_t max_preferred_slm_size = intel_device_info_get_max_preferred_slm_size(devinfo);
+   const uint32_t invocations_per_ss = intel_device_info_get_eu_count_first_subslice(devinfo) *
+                                       devinfo->num_thread_per_eu * cs_simd;
+   uint32_t preferred_slm_size;
+
-   /* Xe2 has 2 requirements for preferred SLM size:
-    * - this value needs to be >= then SLM size
-    * - this value must be less than shared SLM/L1$ RAM in the sub-slice of platform
-    *
-    * For now it is not calculating the optimal preferred SLM allocation,
-    * it is just setting the minimum value that comply with first restriction.
-    */
-   return intel_compute_preferred_slm_encode_size(devinfo->ver, slm_size);
+   if (slm_size_per_workgroup) {
+      uint32_t workgroups_per_ss = invocations_per_ss / invocations_per_workgroup;
+
+      preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup;
+      preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size);
+   } else {
+      preferred_slm_size = 0;
+   }
+
+   assert(preferred_slm_size >= slm_size_per_workgroup);
+   return intel_compute_preferred_slm_encode_size(devinfo->ver, preferred_slm_size);
 }
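
To make the new calculation concrete, here is a standalone sketch of the
same arithmetic with made-up device values (the real EU count, threads per
EU and preferred-SLM cap come from intel_device_info; every number below is
illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   /* Hypothetical device: 8 EUs in the first subslice, 8 threads per EU,
    * and a 128KB preferred-SLM cap per subslice. */
   const uint32_t eu_count_first_subslice = 8;
   const uint32_t num_thread_per_eu = 8;
   const uint32_t max_preferred_slm_size = 128 * 1024;

   /* Hypothetical dispatch: SIMD16, 256 invocations per workgroup,
    * 16KB of SLM per workgroup. */
   const uint8_t cs_simd = 16;
   const uint32_t invocations_per_workgroup = 256;
   const uint32_t slm_size_per_workgroup = 16 * 1024;

   /* Invocations a subslice can have in flight: 8 * 8 * 16 = 1024. */
   const uint32_t invocations_per_ss = eu_count_first_subslice *
                                       num_thread_per_eu * cs_simd;

   /* Workgroups that can run concurrently: 1024 / 256 = 4. */
   const uint32_t workgroups_per_ss = invocations_per_ss /
                                      invocations_per_workgroup;

   /* Preferred SLM: 4 * 16KB = 64KB, clamped to the subslice cap. */
   uint32_t preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup;
   preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size);

   assert(preferred_slm_size >= slm_size_per_workgroup);
   printf("preferred SLM per subslice: %u KB\n", preferred_slm_size / 1024);
   return 0;
}

This prints "preferred SLM per subslice: 64 KB"; the driver then feeds that
byte count to intel_compute_preferred_slm_encode_size() to obtain the
hardware encoding.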


@@ -11,4 +11,7 @@
 uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes);
 uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes);
-uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo, uint32_t slm_size);
+uint32_t intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo,
+                                                      const uint32_t slm_size_per_workgroup,
+                                                      const uint32_t invocations_per_workgroup,
+                                                      const uint8_t cs_simd);


@@ -286,7 +286,10 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
       .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
       .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
       .PreferredSLMAllocationSize =
-         intel_compute_preferred_slm_calc_encode_size(devinfo, prog_data->base.total_shared),
+         intel_compute_preferred_slm_calc_encode_size(devinfo,
+                                                      prog_data->base.total_shared,
+                                                      dispatch->group_size,
+                                                      dispatch->simd_size),
       .NumberOfBarriers = prog_data->uses_barrier,
    };
 }
}


@@ -1795,7 +1795,10 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
       task.SharedLocalMemorySize =
          intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
       task.PreferredSLMAllocationSize =
-         intel_compute_preferred_slm_calc_encode_size(devinfo, task_prog_data->base.base.total_shared);
+         intel_compute_preferred_slm_calc_encode_size(devinfo,
+                                                      task_prog_data->base.base.total_shared,
+                                                      task_dispatch.group_size,
+                                                      task_dispatch.simd_size);
 
       /*
        * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address
@@ -1876,7 +1879,10 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
       mesh.SharedLocalMemorySize =
          intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
       mesh.PreferredSLMAllocationSize =
-         intel_compute_preferred_slm_calc_encode_size(devinfo, mesh_prog_data->base.base.total_shared);
+         intel_compute_preferred_slm_calc_encode_size(devinfo,
+                                                      mesh_prog_data->base.base.total_shared,
+                                                      mesh_dispatch.group_size,
+                                                      mesh_dispatch.simd_size);
 
       /*
        * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address