intel/compiler/mesh: compactify MUE layout

Instead of using 4 dwords for each output slot, use only the amount
of memory actually needed by each variable.

There are some complications from this "obvious" idea:
- flat and non-flat variables can't be merged into the same vec4 slot,
  because the flat inputs mask has a vec4 stride
- multi-slot variables can have different layouts:
   float[N] requires N 1-dword slots, but
   i64vec3 requires 1 fully occupied 4-dword slot followed by a 2-dword slot
- some output variables occur both in single-channel/component split
  and combined variants
- crossing a vec4 boundary requires generating more writes, so avoiding
  such crossings where possible is beneficial

This patch fixes some issues with arrays in per-vertex and per-primitive data
(func.mesh.ext.outputs.*.indirect_array.q0 in crucible)
and, by reducing the size of a single MUE, it allows spawning more threads
at the same time.

Note: this patch doesn't improve vk_meshlet_cadscene performance because
default layout is already optimal enough.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
This commit is contained in:
Marcin Ślusarz
2022-12-21 15:40:07 +01:00
committed by Marge Bot
parent fb765a65c8
commit a252123363
8 changed files with 478 additions and 118 deletions

View File

@@ -1764,10 +1764,10 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const nir_shader *nir,
const struct brw_mue_map *mue_map)
{
memset(prog_data->urb_setup, -1,
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
int urb_next = 0;
int urb_next = 0; /* in vec4s */
const uint64_t inputs_read =
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
@@ -1782,6 +1782,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
uint64_t per_prim_inputs_read =
nir->info.inputs_read & nir->info.per_primitive_inputs;
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
* are always at the beginning, because they come from MUE
* Primitive Header, not Per-Primitive Attributes.
@@ -1789,8 +1792,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
VARYING_BIT_LAYER |
VARYING_BIT_PRIMITIVE_SHADING_RATE;
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
if (per_prim_inputs_read & primitive_header_bits) {
if (reads_header) {
/* Primitive Shading Rate, Layer and Viewport live in the same
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
* is dword 2).
@@ -1804,23 +1808,30 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
/* 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputRead[Offset|Length]
* are in full GRFs (8 dwords) and MUE Primitive Header is 8 dwords,
* so next per-primitive attribute must be placed in slot 2 (each slot
* is 4 dwords long).
*/
urb_next = 2;
per_prim_inputs_read &= ~primitive_header_bits;
} else {
/* If fs doesn't need primitive header, then it won't be made
* available through SBE_MESH, so we have to skip them when
* calculating offset from start of per-prim data.
*/
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
}
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
u_foreach_bit64(i, per_prim_inputs_read) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_prim_start_dw);
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
/* The actual setup attributes later must be aligned to a full GRF. */
urb_next = ALIGN(urb_next, 2);
urb_next = per_prim_size_dw / 4;
prog_data->num_per_primitive_inputs = urb_next;
}
@@ -1835,21 +1846,43 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
unique_fs_attrs &= ~clip_dist_bits;
}
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
/* Per-Vertex header is never available to fragment shader. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
/* In Mesh, CLIP_DIST slots are always at the beginning, because
* they come from MUE Vertex Header, not Per-Vertex Attributes.
*/
if (inputs_read & clip_dist_bits) {
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
} else if (mue_map->per_vertex_header_size_dw > 8) {
/* Clip distances are in MUE, but we are not reading them in FS. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
}
/* Per-Vertex attributes are laid out ordered. Because we always link
* Mesh and Fragment shaders, the slots that are written and read by
* each of them will match. */
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (unique_fs_attrs & BITFIELD64_BIT(i))
prog_data->urb_setup[i] = urb_next++;
u_foreach_bit64(i, unique_fs_attrs) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_vertex_start_dw);
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
urb_next += per_vertex_size_dw / 4;
} else if (devinfo->ver >= 6) {
uint64_t vue_header_bits =
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;