intel/devinfo: Add devinfo->max_scratch_ids

This consolidates several duplicated pieces of code into devinfo.
max_scratch_ids is an array that provides the max number of threads
for the rendering and compute stages.

This fixes some exceptions missed by crocus for scratch ids on haswell
and cherryview.

It also fills out devinfo->max_scratch_ids properly for stages VS
through CS on Gfx12.5. Functionally, this should not make a
difference, as Gfx12.5 already uses COMPUTE for all stages.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12799>
This commit is contained in:
Jordan Justen
2021-09-08 16:54:57 -07:00
parent 32e848aeaa
commit c7d8a69249
6 changed files with 123 additions and 239 deletions

View File

@@ -2645,23 +2645,9 @@ crocus_get_scratch_space(struct crocus_context *ice,
struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
/* TODO: This doesn't seem to match brw_alloc_stage_scratch */
unsigned cs_subslices = 4 * devinfo->num_slices;
if (!*bop) {
unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * cs_subslices,
};
uint32_t size = per_thread_scratch * max_threads[stage];
assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
*bop = crocus_bo_alloc(bufmgr, "scratch", size);
}

View File

@@ -2320,62 +2320,9 @@ iris_get_scratch_space(struct iris_context *ice,
struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
/* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
*
* "Scratch Space per slice is computed based on 4 sub-slices. SW
* must allocate scratch space enough so that each slice has 4
* slices allowed."
*
* According to the other driver team, this applies to compute shaders
* as well. This is not currently documented at all.
*
* This hack is no longer necessary on Gfx11+.
*
* For, Gfx11+, scratch space allocation is based on the number of threads
* in the base configuration.
*/
unsigned subslice_total = devinfo->subslice_total;
if (devinfo->verx10 == 125)
subslice_total = 32;
else if (devinfo->ver == 12)
subslice_total = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
else if (devinfo->ver == 11)
subslice_total = 8;
else if (devinfo->ver < 11)
subslice_total = 4 * devinfo->num_slices;
assert(subslice_total >= devinfo->subslice_total);
if (!*bop) {
unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
if (devinfo->ver >= 12) {
/* Same as ICL below, but with 16 EUs. */
scratch_ids_per_subslice = 16 * 8;
} else if (devinfo->ver == 11) {
/* The MEDIA_VFE_STATE docs say:
*
* "Starting with this configuration, the Maximum Number of
* Threads must be set to (#EU * 8) for GPGPU dispatches.
*
* Although there are only 7 threads per EU in the configuration,
* the FFTID is calculated as if there are 8 threads per EU,
* which in turn requires a larger amount of Scratch Space to be
* allocated by the driver."
*/
scratch_ids_per_subslice = 8 * 8;
}
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total,
};
uint32_t size = per_thread_scratch * max_threads[stage];
assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
*bop = iris_bo_alloc(bufmgr, "scratch", size, 1, IRIS_MEMZONE_SHADER, 0);
}

View File

@@ -28,7 +28,6 @@
#include <string.h>
#include <unistd.h>
#include "intel_device_info.h"
#include "compiler/shader_enums.h"
#include "intel/common/intel_gem.h"
#include "util/bitscan.h"
#include "util/debug.h"
@@ -1460,6 +1459,110 @@ fixup_chv_device_info(struct intel_device_info *devinfo)
memcpy(needle, bsw_model, 3);
}
/**
 * Fill in devinfo->max_scratch_ids[] for every shader stage.
 *
 * The compute entry is the product of a per-subslice scratch id count and a
 * subslice count, both selected below per hardware generation.  Before
 * Gfx12.5 the other stages take the devinfo->max_*_threads value of the
 * matching stage; on Gfx12.5 and later every stage gets the compute value.
 *
 * \param devinfo  device description; read for version/topology fields and
 *                 written only through max_scratch_ids[].
 */
static void
init_max_scratch_ids(struct intel_device_info *devinfo)
{
   /* Upper bound on the number of subslices that may appear in a scratch
    * space id.
    *
    * Gfx8 and older: devinfo->subslice_total is used unchanged.
    *
    * Gfx9 up to (not including) Gfx11: devinfo->subslice_total is the TOTAL
    * number of subslices, but we want to pretend every slice has 4
    * subslices regardless of the real per-slice count.  The documentation
    * for 3DSTATE_PS "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices. SW
    *     must allocate scratch space enough so that each slice has 4
    *     slices allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well.  This is not currently documented at all.
    *
    * Gfx11+: scratch space allocation is based on the number of threads in
    * the base configuration, hence the fixed counts.
    */
   unsigned subslice_count = devinfo->subslice_total;
   if (devinfo->verx10 == 125) {
      subslice_count = 32;
   } else if (devinfo->ver == 12) {
      if (devinfo->is_dg1 || devinfo->gt == 2)
         subslice_count = 6;
      else
         subslice_count = 2;
   } else if (devinfo->ver == 11) {
      subslice_count = 8;
   } else if (devinfo->ver >= 9 && devinfo->ver < 11) {
      subslice_count = 4 * devinfo->num_slices;
   }

   /* The count chosen above must never undershoot the real topology. */
   assert(subslice_count >= devinfo->subslice_total);

   /* Scratch ids consumed per subslice.  Unless one of the special cases
    * below applies, this is simply devinfo->max_cs_threads.
    */
   unsigned ids_per_subslice = devinfo->max_cs_threads;
   if (devinfo->ver >= 11) {
      /* Gfx11: the MEDIA_VFE_STATE docs say:
       *
       *    "Starting with this configuration, the Maximum Number of
       *     Threads must be set to (#EU * 8) for GPGPU dispatches.
       *
       *     Although there are only 7 threads per EU in the configuration,
       *     the FFTID is calculated as if there are 8 threads per EU,
       *     which in turn requires a larger amount of Scratch Space to be
       *     allocated by the driver."
       *
       * Gfx12+: same as ICL, but with 16 EUs instead of 8.
       */
      ids_per_subslice = (devinfo->ver >= 12 ? 16 : 8) * 8;
   } else if (devinfo->is_haswell) {
      /* WaCSScratchSize:hsw
       *
       * Haswell's scratch space address calculation appears to be sparse
       * rather than tightly packed.  The Thread ID has bits indicating
       * which subslice, EU within a subslice, and thread within an EU it
       * is.  There's a maximum of two slices and two subslices, so these
       * can be stored with a single bit.  Even though there are only 10
       * EUs per subslice, this is stored in 4 bits, so there's an
       * effective maximum value of 16 EUs.  Similarly, although there are
       * only 7 threads per EU, this is stored in a 3 bit number, giving an
       * effective maximum value of 8 threads per EU.
       *
       * So the per-subslice count is 16 * 8 rather than 10 * 7.
       */
      ids_per_subslice = 16 * 8;
   } else if (devinfo->is_cherryview) {
      /* Cherryview devices have either 6 or 8 EUs per subslice, and each
       * EU has 7 threads.  The 6 EU devices appear to calculate thread IDs
       * as if they had 8 EUs.
       */
      ids_per_subslice = 8 * 7;
   }

   const unsigned compute_ids = ids_per_subslice * subslice_count;

   if (devinfo->verx10 >= 125) {
      /* On GFX version 12.5, scratch access changed to a surface-based
       * model.  Instead of each shader type having its own layout based on
       * IDs passed from the relevant fixed-function unit, all scratch
       * access is based on thread IDs like it always has been for compute,
       * so every stage receives the compute value.
       */
      for (int stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES; stage++)
         devinfo->max_scratch_ids[stage] = compute_ids;
      return;
   }

   /* Older hardware: each fixed-function stage is bounded by its own thread
    * limit; only compute uses the thread-id based count.
    */
   const unsigned per_stage_ids[] = {
      [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
      [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
      [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
      [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
      [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
      [MESA_SHADER_COMPUTE]   = compute_ids,
   };

   /* The local table and the devinfo array must be the same size for the
    * copy below to cover the destination exactly.
    */
   STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(per_stage_ids));
   memcpy(devinfo->max_scratch_ids, per_stage_ids,
          sizeof(devinfo->max_scratch_ids));
}
bool
intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
{
@@ -1546,5 +1649,7 @@ intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
init_max_scratch_ids(devinfo);
return true;
}

View File

@@ -29,6 +29,7 @@
#include <stdint.h>
#include "util/macros.h"
#include "compiler/shader_enums.h"
#ifdef __cplusplus
extern "C" {
@@ -231,6 +232,13 @@ struct intel_device_info
*/
unsigned max_cs_workgroup_threads;
/**
* The maximum number of potential scratch ids. Due to hardware
* implementation details, the range of scratch ids may be larger than the
* number of subslices.
*/
unsigned max_scratch_ids[MESA_SHADER_STAGES];
struct {
/**
* Fixed size of the URB.

View File

@@ -1464,84 +1464,8 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
if (bo != NULL)
return bo;
unsigned subslices = devinfo->subslice_total;
/* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
*
* "Scratch Space per slice is computed based on 4 sub-slices. SW
* must allocate scratch space enough so that each slice has 4
* slices allowed."
*
* According to the other driver team, this applies to compute shaders
* as well. This is not currently documented at all.
*
* This hack is no longer necessary on Gfx11+.
*
* For, Gfx11+, scratch space allocation is based on the number of threads
* in the base configuration.
*/
if (devinfo->verx10 == 125)
subslices = 32;
else if (devinfo->ver == 12)
subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
else if (devinfo->ver == 11)
subslices = 8;
else if (devinfo->ver >= 9)
subslices = 4 * devinfo->num_slices;
unsigned scratch_ids_per_subslice;
if (devinfo->ver >= 12) {
/* Same as ICL below, but with 16 EUs. */
scratch_ids_per_subslice = 16 * 8;
} else if (devinfo->ver == 11) {
/* The MEDIA_VFE_STATE docs say:
*
* "Starting with this configuration, the Maximum Number of
* Threads must be set to (#EU * 8) for GPGPU dispatches.
*
* Although there are only 7 threads per EU in the configuration,
* the FFTID is calculated as if there are 8 threads per EU,
* which in turn requires a larger amount of Scratch Space to be
* allocated by the driver."
*/
scratch_ids_per_subslice = 8 * 8;
} else if (devinfo->is_haswell) {
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating
* which subslice, EU within a subslice, and thread within an EU it
* is. There's a maximum of two slices and two subslices, so these
* can be stored with a single bit. Even though there are only 10 EUs
* per subslice, this is stored in 4 bits, so there's an effective
* maximum value of 16 EUs. Similarly, although there are only 7
* threads per EU, this is stored in a 3 bit number, giving an
* effective maximum value of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
scratch_ids_per_subslice = 16 * 8;
} else if (devinfo->is_cherryview) {
/* Cherryview devices have either 6 or 8 EUs per subslice, and each EU
* has 7 threads. The 6 EU devices appear to calculate thread IDs as if
* it had 8 EUs.
*/
scratch_ids_per_subslice = 8 * 7;
} else {
scratch_ids_per_subslice = devinfo->max_cs_threads;
}
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices,
};
uint32_t size = per_thread_scratch * max_threads[stage];
assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
/* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
* are still relative to the general state base address. When we emit

View File

@@ -437,97 +437,11 @@ brw_alloc_stage_scratch(struct brw_context *brw,
brw_bo_unreference(stage_state->scratch_bo);
const struct intel_device_info *devinfo = &brw->screen->devinfo;
unsigned thread_count;
switch(stage_state->stage) {
case MESA_SHADER_VERTEX:
thread_count = devinfo->max_vs_threads;
break;
case MESA_SHADER_TESS_CTRL:
thread_count = devinfo->max_tcs_threads;
break;
case MESA_SHADER_TESS_EVAL:
thread_count = devinfo->max_tes_threads;
break;
case MESA_SHADER_GEOMETRY:
thread_count = devinfo->max_gs_threads;
break;
case MESA_SHADER_FRAGMENT:
thread_count = devinfo->max_wm_threads;
break;
case MESA_SHADER_COMPUTE: {
unsigned subslices = devinfo->subslice_total;
/* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
*
* "Scratch Space per slice is computed based on 4 sub-slices. SW must
* allocate scratch space enough so that each slice has 4 slices
* allowed."
*
* According to the other driver team, this applies to compute shaders
* as well. This is not currently documented at all.
*
* brw->screen->subslice_total is the TOTAL number of subslices
* and we wish to view that there are 4 subslices per slice
* instead of the actual number of subslices per slice.
*
* For, ICL, scratch space allocation is based on the number of threads
* in the base configuration.
*/
if (devinfo->ver == 11)
subslices = 8;
else if (devinfo->ver >= 9 && devinfo->ver < 11)
subslices = 4 * brw->screen->devinfo.num_slices;
unsigned scratch_ids_per_subslice;
if (devinfo->ver >= 11) {
/* The MEDIA_VFE_STATE docs say:
*
* "Starting with this configuration, the Maximum Number of
* Threads must be set to (#EU * 8) for GPGPU dispatches.
*
* Although there are only 7 threads per EU in the configuration,
* the FFTID is calculated as if there are 8 threads per EU,
* which in turn requires a larger amount of Scratch Space to be
* allocated by the driver."
*/
scratch_ids_per_subslice = 8 * 8;
} else if (devinfo->is_haswell) {
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating
* which subslice, EU within a subslice, and thread within an EU it
* is. There's a maximum of two slices and two subslices, so these
* can be stored with a single bit. Even though there are only 10 EUs
* per subslice, this is stored in 4 bits, so there's an effective
* maximum value of 16 EUs. Similarly, although there are only 7
* threads per EU, this is stored in a 3 bit number, giving an
* effective maximum value of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
scratch_ids_per_subslice = 16 * 8;
} else if (devinfo->is_cherryview) {
/* Cherryview devices have either 6 or 8 EUs per subslice, and each
* EU has 7 threads. The 6 EU devices appear to calculate thread IDs
* as if it had 8 EUs.
*/
scratch_ids_per_subslice = 8 * 7;
} else {
scratch_ids_per_subslice = devinfo->max_cs_threads;
}
thread_count = scratch_ids_per_subslice * subslices;
break;
}
default:
unreachable("Unsupported stage!");
}
assert(stage_state->stage < ARRAY_SIZE(devinfo->max_scratch_ids));
unsigned max_ids = devinfo->max_scratch_ids[stage_state->stage];
stage_state->scratch_bo =
brw_bo_alloc(brw->bufmgr, "shader scratch space",
per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
per_thread_size * max_ids, BRW_MEMZONE_SCRATCH);
}
void