intel/compiler,anv: put some vertex and primitive data in headers

Both per-primitive and per-vertex space are allocated in MUE in 8-dword
chunks and those 8-dword chunks (granularity of
3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputReadLength)
are passed to fragment shaders as inputs (either non-interpolated
for per-primitive and flat vertex attributes or interpolated
for non-flat vertex attributes).

Some attributes have a special meaning and must be placed in a separate
8/16-dword slot called the Primitive Header or the Vertex Header.

Primitive Header contains 4 such attributes (Cull Primitive,
ViewportIndex, RTAIndex, CPS), leaving 4 dwords (the rest of 8-dword
slot) potentially unused.

Vertex Header is similar - it starts with 3 unused dwords, 1 dword for
Point Size (but if we declare that shader doesn't produce Point Size
then we can reuse it), followed by 4 dwords for Position and optionally
8 dwords for clip distances.

This means we have an interesting optimization problem - we can put
some user attributes into holes in Primitive and Vertex Headers, which
may lead to smaller MUE size and potentially more mesh threads running
in parallel, but we have to be careful to use those holes only when
we need it, otherwise we could force HW to pass too much data to
fragment shader.

Example 1:
Let's assume that Primitive Header is enabled and user defined
12 dwords of per-primitive attributes.

Without packing we would consume 8 + ALIGN(12, 8) = 24 dwords of
MUE space and pass ALIGN(12, 8) = 16 dwords to fragment shader.

With packing, we'll consume 4 + 4 + ALIGN(12 - 4, 8) = 16 dwords of
MUE space and pass ALIGN(4, 8) + ALIGN(12 - 4, 8) = 16 dwords to
fragment shader.

16/16 is better than 24/16, so packing makes sense.

Example 2:
Now let's assume that Primitive Header is enabled and user defined
16 dwords of per-primitive attributes.

Without packing we would consume 8 + ALIGN(16, 8) = 24 dwords of
MUE space and pass ALIGN(16, 8) = 16 dwords to fragment shader.

With packing, we'll consume 4 + 4 + ALIGN(16 - 4, 8) = 24 dwords of
MUE space and pass ALIGN(4, 8) + ALIGN(16 - 4, 8) = 24 dwords to
fragment shader.

24/24 is worse than 24/16, so packing doesn't make sense.

This change doesn't affect vk_meshlet_cadscene in default configuration,
but it speeds it up by up to 25% with "-extraattributes N", where
N is some small value divisible by 2 (by default N == 1) and we
are bound by URB size.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
This commit is contained in:
Marcin Ślusarz
2022-12-21 15:42:55 +01:00
committed by Marge Bot
parent a252123363
commit c1685f08dd
4 changed files with 193 additions and 20 deletions

View File

@@ -1636,12 +1636,14 @@ struct brw_mue_map {
uint32_t per_primitive_header_size_dw;
uint32_t per_primitive_data_size_dw;
uint32_t per_primitive_pitch_dw;
bool user_data_in_primitive_header;
uint32_t max_vertices;
uint32_t per_vertex_start_dw;
uint32_t per_vertex_header_size_dw;
uint32_t per_vertex_data_size_dw;
uint32_t per_vertex_pitch_dw;
bool user_data_in_vertex_header;
};
struct brw_task_prog_data {

View File

@@ -1794,7 +1794,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
VARYING_BIT_PRIMITIVE_SHADING_RATE;
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
if (reads_header) {
if (reads_header || mue_map->user_data_in_primitive_header) {
/* Primitive Shading Rate, Layer and Viewport live in the same
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
* is dword 2).
@@ -1849,9 +1849,13 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
/* Per-Vertex header is never available to fragment shader. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
/* Per-Vertex header is available to fragment shader only if there's
* user data there.
*/
if (!mue_map->user_data_in_vertex_header) {
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
}
/* In Mesh, CLIP_DIST slots are always at the beginning, because
* they come from MUE Vertex Header, not Per-Vertex Attributes.

View File

@@ -438,7 +438,7 @@ struct attr_type_info {
std::list<struct attr_desc> *order;
/* attributes after which there's hole of size equal to array index */
std::list<int> holes[4];
std::list<int> holes[5];
};
static void
@@ -490,22 +490,71 @@ brw_nir_find_complete_variable_with_location(nir_shader *shader,
return best_var;
}
/* Returns the sum of the dwords field over every entry of the given
 * attribute list.  An empty list yields 0.
 */
static unsigned
brw_sum_size(const std::list<struct attr_desc> &orders)
{
   unsigned total_dw = 0;

   for (const auto &desc : orders)
      total_dw += desc.dwords;

   return total_dw;
}
/* Finds order of outputs which require minimum size, without splitting
* of URB read/write messages (which operate on vec4-aligned memory).
*/
static void
brw_compute_mue_layout(std::list<struct attr_desc> *orders,
uint64_t outputs_written,
struct nir_shader *nir)
struct nir_shader *nir,
bool *pack_prim_data_into_header,
bool *pack_vert_data_into_header)
{
const struct shader_info *info = &nir->info;
struct attr_type_info data[3];
bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);
unsigned header_packing = (unsigned)debug_get_num_option("BRW_MESH_HEADER_PACKING", 3);
if ((header_packing & 1) == 0)
*pack_prim_data_into_header = false;
if ((header_packing & 2) == 0)
*pack_vert_data_into_header = false;
for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
data[i].order = &orders[i];
/* If packing into header is enabled, add a hole of size 4 and add
* a virtual location to keep the algorithm happy (it expects holes
* to be preceded by some location). We'll remove those virtual
* locations at the end.
*/
const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
struct attr_desc d;
d.location = virtual_header_location;
d.type = NULL;
d.dwords = 0;
d.slots = 0;
struct attr_desc h;
h.location = -1;
h.type = NULL;
h.dwords = 4;
h.slots = 0;
if (*pack_prim_data_into_header) {
orders[PRIM].push_back(d);
orders[PRIM].push_back(h);
data[PRIM].holes[4].push_back(virtual_header_location);
}
if (*pack_vert_data_into_header) {
orders[VERT].push_back(d);
orders[VERT].push_back(h);
data[VERT].holes[4].push_back(virtual_header_location);
}
u_foreach_bit64(location, outputs_written) {
if ((BITFIELD64_BIT(location) & outputs_written) == 0)
continue;
@@ -519,7 +568,6 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
nir_var_shader_out,
location);
struct attr_desc d;
d.location = location;
d.type = brw_nir_get_var_type(nir, var);
d.dwords = glsl_count_dword_slots(d.type, false);
@@ -539,13 +587,26 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
/* special case to use hole of size 4 */
if (d.dwords == 4 && !holes[4].empty()) {
holes[4].pop_back();
assert(order->front().location == virtual_header_location);
order->pop_front();
assert(order->front().location == -1);
assert(order->front().dwords == 4);
order->front() = d;
continue;
}
int mod = d.dwords % 4;
if (mod == 0) {
order->push_back(d);
continue;
}
struct attr_desc h;
h.location = -1;
h.type = NULL;
h.dwords = 4 - mod;
@@ -568,7 +629,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
unsigned found = 0;
/* try to find the smallest hole big enough to hold this attribute */
for (unsigned sz = d.dwords; sz < 4; sz++){
for (unsigned sz = d.dwords; sz <= 4; sz++){
if (!holes[sz].empty()) {
found = sz;
break;
@@ -584,7 +645,7 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
continue;
}
assert(found < 4);
assert(found <= 4);
assert(!holes[found].empty());
int after_loc = holes[found].back();
holes[found].pop_back();
@@ -632,6 +693,61 @@ brw_compute_mue_layout(std::list<struct attr_desc> *orders,
assert(inserted_back);
}
if (*pack_prim_data_into_header) {
if (orders[PRIM].front().location == virtual_header_location)
orders[PRIM].pop_front();
if (!data[PRIM].holes[4].empty()) {
*pack_prim_data_into_header = false;
assert(orders[PRIM].front().location == -1);
assert(orders[PRIM].front().dwords == 4);
orders[PRIM].pop_front();
}
if (*pack_prim_data_into_header) {
unsigned sz = brw_sum_size(orders[PRIM]);
if (sz % 8 == 0 || sz % 8 > 4)
*pack_prim_data_into_header = false;
}
}
if (*pack_vert_data_into_header) {
if (orders[VERT].front().location == virtual_header_location)
orders[VERT].pop_front();
if (!data[VERT].holes[4].empty()) {
*pack_vert_data_into_header = false;
assert(orders[VERT].front().location == -1);
assert(orders[VERT].front().dwords == 4);
orders[VERT].pop_front();
}
if (*pack_vert_data_into_header) {
unsigned sz = brw_sum_size(orders[VERT]) +
brw_sum_size(orders[VERT_FLAT]);
if (sz % 8 == 0 || sz % 8 > 4)
*pack_vert_data_into_header = false;
}
}
if (INTEL_DEBUG(DEBUG_MESH)) {
fprintf(stderr, "MUE attribute order:\n");
for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
if (!orders[i].empty())
fprintf(stderr, "%d: ", i);
for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
}
if (!orders[i].empty())
fprintf(stderr, "\n");
}
}
}
/* Mesh URB Entry consists of an initial section
@@ -717,7 +833,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
std::list<struct attr_desc> orders[3];
uint64_t regular_outputs = outputs_written &
~(per_primitive_header_bits | per_vertex_header_bits);
brw_compute_mue_layout(orders, regular_outputs, nir);
/* packing into prim header is possible only if prim header is present */
map->user_data_in_primitive_header =
(outputs_written & per_primitive_header_bits) != 0;
/* Packing into vert header is always possible, but we allow it only
* if full vec4 is available (so point size is not used) and there's
* nothing between it and normal vertex data (so no clip distances).
*/
map->user_data_in_vertex_header =
(outputs_written & per_vertex_header_bits) ==
BITFIELD64_BIT(VARYING_SLOT_POS);
brw_compute_mue_layout(orders, regular_outputs, nir,
&map->user_data_in_primitive_header,
&map->user_data_in_vertex_header);
if (outputs_written & per_primitive_header_bits) {
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
@@ -752,13 +883,22 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
map->per_primitive_data_size_dw = 0;
unsigned start_dw = map->per_primitive_start_dw +
map->per_primitive_header_size_dw;
unsigned start_dw = map->per_primitive_start_dw;
if (map->user_data_in_primitive_header)
start_dw += 4; /* first 4 dwords are used */
else
start_dw += map->per_primitive_header_size_dw;
unsigned header_used_dw = 0;
for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
if (map->user_data_in_primitive_header && header_used_dw < 4)
header_used_dw += (*it).dwords;
else
map->per_primitive_data_size_dw += (*it).dwords;
assert(header_used_dw <= 4);
continue;
}
@@ -770,7 +910,11 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
if (map->user_data_in_primitive_header && header_used_dw < 4)
header_used_dw += (*it).dwords;
else
map->per_primitive_data_size_dw += (*it).dwords;
assert(header_used_dw <= 4);
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
@@ -819,14 +963,24 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
map->per_vertex_data_size_dw = 0;
start_dw = map->per_vertex_start_dw +
map->per_vertex_header_size_dw;
start_dw = map->per_vertex_start_dw;
if (!map->user_data_in_vertex_header)
start_dw += map->per_vertex_header_size_dw;
header_used_dw = 0;
for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
if (map->user_data_in_vertex_header && header_used_dw < 4) {
header_used_dw += (*it).dwords;
assert(header_used_dw <= 4);
if (header_used_dw == 4)
start_dw += 4; /* jump over gl_position */
} else {
map->per_vertex_data_size_dw += (*it).dwords;
}
continue;
}
@@ -837,7 +991,14 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
if (map->user_data_in_vertex_header && header_used_dw < 4) {
header_used_dw += (*it).dwords;
assert(header_used_dw <= 4);
if (header_used_dw == 4)
start_dw += 4; /* jump over gl_position */
} else {
map->per_vertex_data_size_dw += (*it).dwords;
}
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
}

View File

@@ -558,6 +558,11 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
}
if (mue->user_data_in_vertex_header) {
sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
}
assert(mue->per_primitive_header_size_dw % 8 == 0);
sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
@@ -569,7 +574,8 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
*/
if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) {
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
mue->user_data_in_primitive_header) {
assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;