ac/nir/ngg: Refactor LDS instructions in NGG GS vertex emit and export.
Change the NGG GS emit vertex code to emit combined shared stores, and change the export vertex code to emit combined shared loads. This results in more optimal code generation, i.e. fewer LDS instructions are generated.

GS vertices are stored using an odd stride to minimize the chance of bank conflicts, which unfortunately means that we still can't use an alignment higher than 4 here, so the best we can get are some ds_read2_b32 instructions.

Fossil DB stats on Navi 21 (formerly Sienna Cichlid):

Totals from 135 (0.10% of 128653) affected shaders:
VGPRs: 6416 -> 6512 (+1.50%)
CodeSize: 529436 -> 503792 (-4.84%)
MaxWaves: 2952 -> 2924 (-0.95%)
Instrs: 93384 -> 90176 (-3.44%)
Latency: 290283 -> 293611 (+1.15%); split: -0.36%, +1.50%
InvThroughput: 81218 -> 82598 (+1.70%)
Copies: 6603 -> 6606 (+0.05%)
PreVGPRs: 5037 -> 5076 (+0.77%)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11425>
This commit is contained in:

committed by
Daniel Schürmann

parent
8ab1e9826d
commit
2ac3e921e3
@@ -71,11 +71,11 @@ typedef struct
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* bitsize of this component (max 32), or 0 if it's never written at all */
|
||||
uint8_t bit_size : 6;
|
||||
/* Bitmask of components used: 4 bits per slot, 1 bit per component. */
|
||||
uint8_t components_mask : 4;
|
||||
/* output stream index */
|
||||
uint8_t stream : 2;
|
||||
} gs_output_component_info;
|
||||
} gs_output_info;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -93,7 +93,7 @@ typedef struct
|
||||
bool found_out_vtxcnt[4];
|
||||
bool output_compile_time_known;
|
||||
bool provoking_vertex_last;
|
||||
gs_output_component_info output_component_info[VARYING_SLOT_MAX][4];
|
||||
gs_output_info output_info[VARYING_SLOT_MAX];
|
||||
} lower_ngg_gs_state;
|
||||
|
||||
/* LDS layout of Mesh Shader workgroup info. */
|
||||
@@ -1637,16 +1637,18 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
|
||||
if (num_consumed_components > 1)
|
||||
element = nir_extract_bits(b, &element, 1, 0, num_consumed_components, 32);
|
||||
|
||||
/* Save output usage info. */
|
||||
gs_output_info *info = &s->output_info[io_sem.location];
|
||||
/* The same output should always belong to the same stream. */
|
||||
assert(!info->components_mask || info->stream == stream);
|
||||
info->stream = stream;
|
||||
info->components_mask |= BITFIELD_BIT(component_offset + comp * num_consumed_components);
|
||||
|
||||
for (unsigned c = 0; c < num_consumed_components; ++c) {
|
||||
unsigned component_index = (comp * num_consumed_components) + c + component_offset;
|
||||
unsigned base_index = base + base_offset + component_index / 4;
|
||||
component_index %= 4;
|
||||
|
||||
/* Save output usage info */
|
||||
gs_output_component_info *info = &s->output_component_info[base_index][component_index];
|
||||
info->bit_size = MAX2(info->bit_size, MIN2(store_val->bit_size, 32));
|
||||
info->stream = stream;
|
||||
|
||||
/* Store the current component element */
|
||||
nir_ssa_def *component_element = element;
|
||||
if (num_consumed_components > 1)
|
||||
@@ -1679,21 +1681,26 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
||||
|
||||
for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
|
||||
unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
|
||||
gs_output_info *info = &s->output_info[slot];
|
||||
if (info->stream != stream || !info->components_mask)
|
||||
continue;
|
||||
|
||||
for (unsigned comp = 0; comp < 4; ++comp) {
|
||||
gs_output_component_info *info = &s->output_component_info[slot][comp];
|
||||
if (info->stream != stream || !info->bit_size)
|
||||
continue;
|
||||
unsigned mask = info->components_mask;
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_ssa_def *values[4] = {0};
|
||||
for (int c = start; c < start + count; ++c) {
|
||||
/* Load output from variable. */
|
||||
values[c - start] = nir_load_var(b, s->output_vars[slot][c]);
|
||||
/* Clear the variable (it is undefined after emit_vertex) */
|
||||
nir_store_var(b, s->output_vars[slot][c], nir_ssa_undef(b, 1, 32), 0x1);
|
||||
}
|
||||
|
||||
/* Store the output to LDS */
|
||||
nir_ssa_def *out_val = nir_load_var(b, s->output_vars[slot][comp]);
|
||||
if (info->bit_size != 32)
|
||||
out_val = nir_u2u(b, out_val, info->bit_size);
|
||||
|
||||
nir_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16 + comp * 4);
|
||||
|
||||
/* Clear the variable that holds the output */
|
||||
nir_store_var(b, s->output_vars[slot][comp], nir_ssa_undef(b, 1, 32), 0x1u);
|
||||
nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, gs_emit_vtx_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.align_mul = 4);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1834,16 +1841,24 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
|
||||
if (!(b->shader->info.outputs_written & BITFIELD64_BIT(slot)))
|
||||
continue;
|
||||
|
||||
gs_output_info *info = &s->output_info[slot];
|
||||
if (!info->components_mask || info->stream != 0)
|
||||
continue;
|
||||
|
||||
unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
|
||||
nir_io_semantics io_sem = { .location = slot, .num_slots = 1 };
|
||||
|
||||
for (unsigned comp = 0; comp < 4; ++comp) {
|
||||
gs_output_component_info *info = &s->output_component_info[slot][comp];
|
||||
if (info->stream != 0 || info->bit_size == 0)
|
||||
continue;
|
||||
unsigned mask = info->components_mask;
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_ssa_def *load =
|
||||
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.align_mul = 4);
|
||||
|
||||
nir_ssa_def *load = nir_load_shared(b, 1, info->bit_size, exported_out_vtx_lds_addr, .base = packed_location * 16u + comp * 4u, .align_mul = 4u);
|
||||
nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = comp, .io_semantics = io_sem);
|
||||
nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .io_semantics = io_sem,
|
||||
.component = start, .write_mask = BITFIELD_MASK(count));
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user