intel: Add driver support for hardware-generated local invocation IDs

This adds a few new fields in the brw_cs_prog_data struct and then
uses them to fill in the relevant COMPUTE_WALKER fields.

Although the Tile Layout field theoretically has different settings for
32/64/128bpe, it appears that the recommended programming is to always
pick either TileY 32bpe or Linear.  It's not very practical to look at
the surface formats involved, anyway.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27167>
This commit is contained in:
Kenneth Graunke
2023-11-28 02:33:41 -08:00
committed by Marge Bot
parent 10ed4f1cab
commit 5e7f4ff97f
5 changed files with 40 additions and 0 deletions

View File

@@ -8677,6 +8677,14 @@ iris_upload_compute_walker(struct iris_context *ice,
cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
cw.InterfaceDescriptor = idd; cw.InterfaceDescriptor = idd;
#if GFX_VERx10 >= 125
cw.GenerateLocalID = cs_prog_data->generate_local_id != 0;
cw.EmitLocal = cs_prog_data->generate_local_id;
cw.WalkOrder = cs_prog_data->walk_order;
cw.TileLayout = cs_prog_data->walk_order == BRW_WALK_ORDER_YXZ ?
TileY32bpe : Linear;
#endif
assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0); assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0);
} }
} }

View File

@@ -2191,6 +2191,14 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
cw.IndirectDataStartAddress = push_const_offset; cw.IndirectDataStartAddress = push_const_offset;
cw.IndirectDataLength = push_const_size; cw.IndirectDataLength = push_const_size;
#if GFX_VERx10 >= 125
cw.GenerateLocalID = cs_prog_data->generate_local_id != 0;
cw.EmitLocal = cs_prog_data->generate_local_id;
cw.WalkOrder = cs_prog_data->walk_order;
cw.TileLayout = cs_prog_data->walk_order == BRW_WALK_ORDER_YXZ ?
TileY32bpe : Linear;
#endif
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = params->cs_prog_kernel, .KernelStartPointer = params->cs_prog_kernel,
.SamplerStatePointer = samplers_offset, .SamplerStatePointer = samplers_offset,

View File

@@ -1322,6 +1322,15 @@ struct brw_push_const_block {
unsigned size; /* Bytes, register aligned */ unsigned size; /* Bytes, register aligned */
}; };
/**
 * Walk-order selector stored in brw_cs_prog_data::walk_order.
 *
 * The drivers copy this value unmodified into the COMPUTE_WALKER
 * WalkOrder field, and use it to pick the TileLayout field as well:
 * BRW_WALK_ORDER_YXZ selects TileY32bpe, every other value selects Linear.
 *
 * NOTE(review): because the value is handed straight to the hardware
 * command, the explicit numbers below presumably have to match the
 * hardware encoding of WalkOrder -- confirm against the COMPUTE_WALKER
 * documentation before renumbering or adding entries.
 *
 * NOTE(review): the XYZ/XZY/... suffixes presumably name the order in
 * which the three dispatch dimensions are walked -- confirm.
 */
enum PACKED brw_compute_walk_order {
BRW_WALK_ORDER_XYZ = 0,
BRW_WALK_ORDER_XZY = 1,
BRW_WALK_ORDER_YXZ = 2,
BRW_WALK_ORDER_YZX = 3,
BRW_WALK_ORDER_ZXY = 4,
BRW_WALK_ORDER_ZYX = 5,
};
struct brw_cs_prog_data { struct brw_cs_prog_data {
struct brw_stage_prog_data base; struct brw_stage_prog_data base;
@@ -1344,6 +1353,8 @@ struct brw_cs_prog_data {
bool uses_inline_data; bool uses_inline_data;
bool uses_btd_stack_ids; bool uses_btd_stack_ids;
bool uses_systolic; bool uses_systolic;
uint8_t generate_local_id;
enum brw_compute_walk_order walk_order;
struct { struct {
struct brw_push_const_block cross_thread; struct brw_push_const_block cross_thread;

View File

@@ -359,6 +359,11 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
#if GFX_VERx10 == 125 #if GFX_VERx10 == 125
.SystolicModeEnable = prog_data->uses_systolic, .SystolicModeEnable = prog_data->uses_systolic,
#endif #endif
.GenerateLocalID = prog_data->generate_local_id != 0,
.EmitLocal = prog_data->generate_local_id,
.WalkOrder = prog_data->walk_order,
.TileLayout = prog_data->walk_order == BRW_WALK_ORDER_YXZ ?
TileY32bpe : Linear,
.LocalXMaximum = prog_data->local_size[0] - 1, .LocalXMaximum = prog_data->local_size[0] - 1,
.LocalYMaximum = prog_data->local_size[1] - 1, .LocalYMaximum = prog_data->local_size[1] - 1,
.LocalZMaximum = prog_data->local_size[2] - 1, .LocalZMaximum = prog_data->local_size[2] - 1,

View File

@@ -557,6 +557,14 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
cw.ExecutionMask = dispatch.right_mask; cw.ExecutionMask = dispatch.right_mask;
cw.PostSync.MOCS = anv_mocs(device, NULL, 0); cw.PostSync.MOCS = anv_mocs(device, NULL, 0);
#if GFX_VERx10 >= 125
cw.GenerateLocalID = prog_data->generate_local_id != 0;
cw.EmitLocal = prog_data->generate_local_id;
cw.WalkOrder = prog_data->walk_order;
cw.TileLayout = prog_data->walk_order == BRW_WALK_ORDER_YXZ ?
TileY32bpe : Linear;
#endif
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = state->kernel->kernel.offset + .KernelStartPointer = state->kernel->kernel.offset +
brw_cs_prog_data_prog_offset(prog_data, brw_cs_prog_data_prog_offset(prog_data,