intel/dev: Add a max_cs_workgroup_threads field
This is distinct form max_cs_threads because it also encodes restrictions about the way we use GPGPU/COMPUTE_WALKER. This gets rid of the MIN2(64, devinfo->max_cs_threads) we have scattered all over the driver and puts it in a central place. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11861>
This commit is contained in:

committed by
Marge Bot

parent
915e5a8cc3
commit
6642749458
@@ -545,8 +545,7 @@ crocus_get_compute_param(struct pipe_screen *pscreen,
|
||||
struct crocus_screen *screen = (struct crocus_screen *)pscreen;
|
||||
const struct intel_device_info *devinfo = &screen->devinfo;
|
||||
|
||||
const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
|
||||
const uint32_t max_invocations = 32 * max_threads;
|
||||
const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads;
|
||||
|
||||
if (devinfo->ver < 7)
|
||||
return 0;
|
||||
|
@@ -516,9 +516,7 @@ iris_get_compute_param(struct pipe_screen *pscreen,
|
||||
struct iris_screen *screen = (struct iris_screen *)pscreen;
|
||||
const struct intel_device_info *devinfo = &screen->devinfo;
|
||||
|
||||
/* Limit max_threads to 64 for the GPGPU_WALKER command. */
|
||||
const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
|
||||
const uint32_t max_invocations = 32 * max_threads;
|
||||
const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads;
|
||||
|
||||
#define RET(x) do { \
|
||||
if (ret) \
|
||||
|
@@ -10099,7 +10099,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
|
||||
prog_data->local_size[2];
|
||||
|
||||
/* Limit max_threads to 64 for the GPGPU_WALKER command */
|
||||
const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
|
||||
const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads;
|
||||
min_dispatch_width = util_next_power_of_two(
|
||||
MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
|
||||
assert(min_dispatch_width <= 32);
|
||||
@@ -10316,8 +10316,7 @@ brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
|
||||
if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32))
|
||||
return 32;
|
||||
|
||||
/* Limit max_threads to 64 for the GPGPU_WALKER command */
|
||||
const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads);
|
||||
const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
|
||||
|
||||
if ((mask & simd8) && group_size <= 8 * max_threads) {
|
||||
/* Prefer SIMD16 if can do without spilling. Matches logic in
|
||||
|
@@ -1228,6 +1228,21 @@ getparam(int fd, uint32_t param, int *value)
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
update_cs_workgroup_threads(struct intel_device_info *devinfo)
|
||||
{
|
||||
/* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
|
||||
* can program is 64 without going up to a rectangular group. This only
|
||||
* impacts Haswell and TGL which have higher thread counts.
|
||||
*
|
||||
* INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
|
||||
* is 10 bits so we have no such restrictions.
|
||||
*/
|
||||
devinfo->max_cs_workgroup_threads =
|
||||
devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
|
||||
MIN2(devinfo->max_cs_threads, 64);
|
||||
}
|
||||
|
||||
bool
|
||||
intel_get_device_info_from_pci_id(int pci_id,
|
||||
struct intel_device_info *devinfo)
|
||||
@@ -1302,6 +1317,8 @@ intel_get_device_info_from_pci_id(int pci_id,
|
||||
if (devinfo->verx10 == 0)
|
||||
devinfo->verx10 = devinfo->ver * 10;
|
||||
|
||||
update_cs_workgroup_threads(devinfo);
|
||||
|
||||
devinfo->chipset_id = pci_id;
|
||||
return true;
|
||||
}
|
||||
@@ -1434,6 +1451,8 @@ fixup_chv_device_info(struct intel_device_info *devinfo)
|
||||
if (max_cs_threads > devinfo->max_cs_threads)
|
||||
devinfo->max_cs_threads = max_cs_threads;
|
||||
|
||||
update_cs_workgroup_threads(devinfo);
|
||||
|
||||
/* Braswell is even more annoying. Its marketing name isn't determinable
|
||||
* from the PCI ID and is also dependent on fusing.
|
||||
*/
|
||||
|
@@ -214,6 +214,17 @@ struct intel_device_info
|
||||
*/
|
||||
unsigned max_cs_threads;
|
||||
|
||||
/**
|
||||
* Maximum number of threads per workgroup supported by the GPGPU_WALKER or
|
||||
* COMPUTE_WALKER command.
|
||||
*
|
||||
* This may be smaller than max_cs_threads as it takes into account added
|
||||
* restrictions on the GPGPU/COMPUTE_WALKER commands. While max_cs_threads
|
||||
* expresses the total parallelism of the GPU, this expresses the maximum
|
||||
* number of threads we can dispatch in a single workgroup.
|
||||
*/
|
||||
unsigned max_cs_workgroup_threads;
|
||||
|
||||
struct {
|
||||
/**
|
||||
* Fixed size of the URB.
|
||||
|
@@ -1899,8 +1899,7 @@ void anv_GetPhysicalDeviceProperties(
|
||||
pdevice->has_bindless_images && pdevice->has_a64_buffer_access
|
||||
? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1;
|
||||
|
||||
/* Limit max_threads to 64 for the GPGPU_WALKER command */
|
||||
const uint32_t max_workgroup_size = 32 * MIN2(64, devinfo->max_cs_threads);
|
||||
const uint32_t max_workgroup_size = 32 * devinfo->max_cs_workgroup_threads;
|
||||
|
||||
VkSampleCountFlags sample_counts =
|
||||
isl_device_get_sample_counts(&pdevice->isl_dev);
|
||||
@@ -2537,8 +2536,7 @@ void anv_GetPhysicalDeviceProperties2(
|
||||
STATIC_ASSERT(8 <= BRW_SUBGROUP_SIZE && BRW_SUBGROUP_SIZE <= 32);
|
||||
props->minSubgroupSize = 8;
|
||||
props->maxSubgroupSize = 32;
|
||||
/* Limit max_threads to 64 for the GPGPU_WALKER command. */
|
||||
props->maxComputeWorkgroupSubgroups = MIN2(64, pdevice->info.max_cs_threads);
|
||||
props->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
|
||||
props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
break;
|
||||
}
|
||||
|
@@ -840,14 +840,8 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
|
||||
|
||||
/* Maximum number of scalar compute shader invocations that can be run in
|
||||
* parallel in the same subslice assuming SIMD32 dispatch.
|
||||
*
|
||||
* We don't advertise more than 64 threads, because we are limited to 64 by
|
||||
* our usage of thread_width_max in the gpgpu walker command. This only
|
||||
* currently impacts Haswell, which otherwise might be able to advertise 70
|
||||
* threads. With SIMD32 and 64 threads, Haswell still provides twice the
|
||||
* required the number of invocation needed for ARB_compute_shader.
|
||||
*/
|
||||
const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
|
||||
const unsigned max_threads = devinfo->max_cs_workgroup_threads;
|
||||
const uint32_t max_invocations = 32 * max_threads;
|
||||
ctx->Const.MaxComputeWorkGroupSize[0] = max_invocations;
|
||||
ctx->Const.MaxComputeWorkGroupSize[1] = max_invocations;
|
||||
|
Reference in New Issue
Block a user