ac/nir/ngg: refine nogs outputs handling

Gather outputs in advance to save both the output data and its type. The
output data is used for streamout and the gfx11 param export; the output
type is used later by streamout.

The output info will also be used for nir vertex export in the future.
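
Roughly, the change adds per-channel storage to lower_ngg_nogs_state and
fills it while walking the store_output intrinsics, so the later steps read
the gathered values instead of re-scanning the shader. A minimal sketch,
using names taken from this change:

  /* gathered while walking store_output intrinsics */
  nir_ssa_def *outputs[VARYING_SLOT_MAX][4];
  shader_output_types output_types;

  /* consumed later, e.g. to build an XFB store vector */
  nir_ssa_def *store_val = nir_vec(b, &s->outputs[slot][start], count);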

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20350>
Author: Qiang Yu
Date: 2022-12-08 18:22:24 +08:00
Committed by: Marge Bot
Parent: 69d11b6926
Commit: cd22bf90e7


@@ -97,6 +97,12 @@ typedef struct
nir_variable *clip_vertex_var;
nir_variable *clipdist_neg_mask_var;
bool has_clipdist;
/* gathered outputs (data and types) */
nir_ssa_def *outputs[VARYING_SLOT_MAX][4];
nir_ssa_def *outputs_16bit_lo[16][4];
nir_ssa_def *outputs_16bit_hi[16][4];
shader_output_types output_types;
} lower_ngg_nogs_state;
typedef struct
@@ -599,6 +605,9 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *st)
nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
.base = st->options->primitive_id_location,
.src_type = nir_type_uint32, .io_semantics = io_sem);
/* Update outputs_written to reflect that the pass added a new output. */
b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID;
}
static void
@@ -1614,66 +1623,111 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
unreachable("Should be VS or TES.");
}
static bool
do_ngg_nogs_store_output_to_lds(nir_builder *b, nir_instr *instr, void *state)
static void
ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
{
lower_ngg_nogs_state *st = (lower_ngg_nogs_state *)state;
if (!s->outputs[VARYING_SLOT_EDGE][0])
return;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_output)
return false;
/* no indirect output */
assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
b->cursor = nir_before_instr(instr);
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
unsigned component = nir_intrinsic_component(intrin);
unsigned write_mask = nir_intrinsic_write_mask(intrin);
nir_ssa_def *store_val = intrin->src[0].ssa;
if (sem.location == VARYING_SLOT_EDGE) {
if (st->has_user_edgeflags) {
/* clamp user edge flag to 1 for later bit operations */
store_val = nir_umin(b, store_val, nir_imm_int(b, 1));
/* remove the original store_output; its value is stored to LDS below instead */
nir_instr_remove(instr);
} else {
/* remove the edge flag output anyway as it should not be passed to next stage */
nir_instr_remove(instr);
return true;
}
} else {
write_mask = nir_instr_xfb_write_mask(intrin) >> component;
if (!(write_mask && st->streamout_enabled))
return false;
}
/* clamp user edge flag to 1 for later bit operations */
nir_ssa_def *edgeflag = s->outputs[VARYING_SLOT_EDGE][0];
edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
/* without streamout, the user edge flag is stored at the beginning of the vertex's LDS space */
unsigned offset = 0;
if (st->streamout_enabled) {
if (s->streamout_enabled) {
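/* With streamout enabled, the XFB outputs occupy the start of the per-vertex
 * LDS space (16 bytes per packed vec4 slot), so place the edge flag after the
 * slots packed below VARYING_SLOT_EDGE.
 */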
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(sem.location));
offset = packed_location * 16 + component * 4;
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
offset = packed_location * 16;
}
nir_ssa_def *tid = nir_load_local_invocation_index(b);
nir_ssa_def *addr = pervertex_lds_addr(b, tid, st->pervertex_lds_bytes);
nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
nir_store_shared(b, store_val, addr, .base = offset, .write_mask = write_mask);
return true;
nir_store_shared(b, edgeflag, addr, .base = offset);
}
static void
ngg_nogs_store_all_outputs_to_lds(nir_shader *shader, lower_ngg_nogs_state *st)
ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
{
nir_shader_instructions_pass(shader, do_ngg_nogs_store_output_to_lds,
nir_metadata_block_index | nir_metadata_dominance, st);
nir_xfb_info *info = b->shader->xfb_info;
uint64_t xfb_outputs = 0;
unsigned xfb_outputs_16bit = 0;
uint8_t xfb_mask[VARYING_SLOT_MAX] = {0};
uint8_t xfb_mask_16bit_lo[16] = {0};
uint8_t xfb_mask_16bit_hi[16] = {0};
/* Get XFB output mask for each slot. */
for (int i = 0; i < info->output_count; i++) {
nir_xfb_output_info *out = info->outputs + i;
if (out->location < VARYING_SLOT_VAR0_16BIT) {
xfb_outputs |= BITFIELD64_BIT(out->location);
xfb_mask[out->location] |= out->component_mask;
} else {
unsigned index = out->location - VARYING_SLOT_VAR0_16BIT;
xfb_outputs_16bit |= BITFIELD_BIT(index);
if (out->high_16bits)
xfb_mask_16bit_hi[index] |= out->component_mask;
else
xfb_mask_16bit_lo[index] |= out->component_mask;
}
}
nir_ssa_def *tid = nir_load_local_invocation_index(b);
nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
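/* Store the gathered 32-bit XFB channels to LDS; each packed slot occupies
 * 16 bytes per vertex and consecutive channels are written as one vector.
 */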
u_foreach_bit64(slot, xfb_outputs) {
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(slot));
unsigned mask = xfb_mask[slot];
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
/* Outputs here are sure to be 32bit.
*
* 64bit outputs have been lowered to two 32bit components. As for 16bit outputs:
* Vulkan does not allow streamout outputs less than 32bit.
* OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
*/
nir_ssa_def *store_val = nir_vec(b, &s->outputs[slot][start], (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
}
}
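/* 16-bit slots are packed after all 32-bit slots; the lo/hi halves of each
 * channel are packed into a single 32-bit dword before the LDS store.
 */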
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
u_foreach_bit64(slot, xfb_outputs_16bit) {
unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
unsigned mask_lo = xfb_mask_16bit_lo[slot];
unsigned mask_hi = xfb_mask_16bit_hi[slot];
nir_ssa_def **outputs_lo = s->outputs_16bit_lo[slot];
nir_ssa_def **outputs_hi = s->outputs_16bit_hi[slot];
nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
unsigned mask = mask_lo | mask_hi;
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_ssa_def *values[4] = {0};
for (int c = start; c < start + count; ++c) {
nir_ssa_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
nir_ssa_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
/* pack the 16-bit lo/hi halves into a 32-bit value; 64-bit outputs have already been lowered */
values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
}
nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
}
}
}
static void
@@ -1937,24 +1991,17 @@ ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
return pervertex_lds_bytes;
}
static unsigned
gather_vs_outputs(nir_builder *b, struct exec_list *cf_list, vs_output *outputs,
const uint8_t *vs_output_param_offset)
static void
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
{
uint64_t output_mask32 = 0;
nir_ssa_def *outputs32[VARYING_SLOT_MAX][4] = {0};
unsigned output_mask16_lo = 0;
unsigned output_mask16_hi = 0;
nir_ssa_def *outputs16_lo[16][4];
nir_ssa_def *outputs16_hi[16][4];
/* Assume:
* - the shader used nir_lower_io_to_temporaries
* - 64-bit outputs are lowered
* - no indirect indexing is present
*/
struct nir_cf_node *first_node = exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
struct nir_cf_node *first_node =
exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
for (nir_block *block = nir_cf_node_cf_tree_first(first_node); block != NULL;
block = nir_block_cf_tree_next(block)) {
nir_foreach_instr_safe (instr, block) {
@@ -1967,61 +2014,92 @@ gather_vs_outputs(nir_builder *b, struct exec_list *cf_list, vs_output *outputs,
assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
unsigned slot = nir_intrinsic_io_semantics(intrin).location;
if (vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
continue;
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
unsigned slot = sem.location;
bool is_hi = nir_intrinsic_io_semantics(intrin).high_16bits;
bool is_16bit = slot >= VARYING_SLOT_VAR0_16BIT;
u_foreach_bit (i, nir_intrinsic_write_mask(intrin)) {
unsigned comp = nir_intrinsic_component(intrin) + i;
nir_ssa_def *chan = nir_channel(b, intrin->src[0].ssa, i);
if (is_16bit && is_hi)
outputs16_hi[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
else if (is_16bit)
outputs16_lo[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
else
outputs32[slot][comp] = chan;
nir_ssa_def **output;
nir_alu_type *type;
if (slot >= VARYING_SLOT_VAR0_16BIT) {
unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
if (sem.high_16bits) {
output = s->outputs_16bit_hi[index];
type = s->output_types.types_16bit_hi[index];
} else {
output = s->outputs_16bit_lo[index];
type = s->output_types.types_16bit_lo[index];
}
} else {
output = s->outputs[slot];
type = s->output_types.types[slot];
}
if (is_16bit && is_hi)
output_mask16_hi |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
else if (is_16bit)
output_mask16_lo |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
else
output_mask32 |= BITFIELD64_BIT(slot);
unsigned component = nir_intrinsic_component(intrin);
unsigned write_mask = nir_intrinsic_write_mask(intrin);
nir_alu_type src_type = nir_intrinsic_src_type(intrin);
if (slot >= VARYING_SLOT_VAR0 || !(BITFIELD64_BIT(slot) & POS_EXPORT_MASK))
nir_instr_remove(&intrin->instr);
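/* Record each written channel and its source type so that streamout and the
 * GFX11 param export can consume them later.
 */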
u_foreach_bit (i, write_mask) {
unsigned c = component + i;
output[c] = nir_channel(b, intrin->src[0].ssa, i);
type[c] = src_type;
}
/* always remove the edge flag output, as it should not be passed to the next stage */
bool is_edge_slot = slot == VARYING_SLOT_EDGE;
/* on GFX11, also remove non-position-export slots; they are written to buffer memory instead */
bool is_pos_export_slot = slot < VARYING_SLOT_MAX && (BITFIELD64_BIT(slot) & POS_EXPORT_MASK);
if (is_edge_slot || (s->options->gfx_level >= GFX11 && !is_pos_export_slot))
nir_instr_remove(instr);
}
}
}
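/* Collect the gathered output channels into vs_output records for the GFX11
 * vertex param export.
 */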
static unsigned
gather_vs_outputs(nir_builder *b, vs_output *outputs, lower_ngg_nogs_state *s)
{
unsigned num_outputs = 0;
u_foreach_bit64 (i, output_mask32) {
outputs[num_outputs].slot = i;
for (unsigned j = 0; j < 4; j++) {
nir_ssa_def *chan = outputs32[i][j];
u_foreach_bit64 (slot, b->shader->info.outputs_written) {
if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
continue;
/* skip this output if no channel was ever written */
if (!s->outputs[slot][0] && !s->outputs[slot][1] &&
!s->outputs[slot][2] && !s->outputs[slot][3])
continue;
outputs[num_outputs].slot = slot;
for (int i = 0; i < 4; i++) {
nir_ssa_def *chan = s->outputs[slot][i];
/* RADV implements 16-bit outputs as 32-bit with VARYING_SLOT_VAR0-31. */
outputs[num_outputs].chan[j] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
outputs[num_outputs].chan[i] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
}
num_outputs++;
}
if (output_mask16_lo | output_mask16_hi) {
nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
u_foreach_bit (i, output_mask16_lo | output_mask16_hi) {
vs_output *output = &outputs[num_outputs++];
u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
unsigned slot = VARYING_SLOT_VAR0_16BIT + i;
if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
continue;
output->slot = i + VARYING_SLOT_VAR0_16BIT;
for (unsigned j = 0; j < 4; j++) {
nir_ssa_def *lo = output_mask16_lo & BITFIELD_BIT(i) ? outputs16_lo[i][j] : NULL;
nir_ssa_def *hi = output_mask16_hi & BITFIELD_BIT(i) ? outputs16_hi[i][j] : NULL;
if (lo || hi)
output->chan[j] = nir_pack_32_2x16_split(b, lo ? lo : undef, hi ? hi : undef);
else
output->chan[j] = NULL;
}
/* skip this output if no channel was ever written */
if (!s->outputs_16bit_lo[i][0] && !s->outputs_16bit_lo[i][1] &&
!s->outputs_16bit_lo[i][2] && !s->outputs_16bit_lo[i][3] &&
!s->outputs_16bit_hi[i][0] && !s->outputs_16bit_hi[i][1] &&
!s->outputs_16bit_hi[i][2] && !s->outputs_16bit_hi[i][3])
continue;
vs_output *output = &outputs[num_outputs++];
output->slot = slot;
nir_ssa_def **output_lo = s->outputs_16bit_lo[i];
nir_ssa_def **output_hi = s->outputs_16bit_hi[i];
nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
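/* pack the lo/hi 16-bit halves of each channel into a 32-bit value for the param export */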
for (int j = 0; j < 4; j++) {
nir_ssa_def *lo = output_lo[j] ? output_lo[j] : undef;
nir_ssa_def *hi = output_hi[j] ? output_hi[j] : undef;
if (output_lo[j] || output_hi[j])
output->chan[j] = nir_pack_32_2x16_split(b, lo, hi);
else
output->chan[j] = NULL;
}
}
@@ -2225,44 +2303,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
}
nir_pop_if(b, if_es_thread);
if (state.streamout_enabled) {
/* TODO: support culling after streamout. */
assert(!options->can_cull);
ngg_nogs_build_streamout(b, &state);
}
if (state.streamout_enabled || has_user_edgeflags) {
ngg_nogs_store_all_outputs_to_lds(shader, &state);
b->cursor = nir_after_cf_list(&impl->body);
}
/* Take care of late primitive export */
if (!state.early_prim_export) {
emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
}
/* Export varyings for GFX11+ */
if (state.options->gfx_level >= GFX11) {
vs_output outputs[64];
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
unsigned num_outputs =
gather_vs_outputs(b, &if_es_thread->then_list, outputs, options->vs_output_param_offset);
if (num_outputs) {
b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
create_vertex_param_phis(b, num_outputs, outputs);
b->cursor = nir_after_cf_list(&impl->body);
if (!num_es_threads)
num_es_threads = nir_load_merged_wave_info_amd(b);
export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs,
options->vs_output_param_offset);
}
}
if (options->can_cull) {
/* Replace uniforms. */
apply_reusable_variables(b, &state);
@@ -2279,7 +2319,50 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = state.position_store_base,
.component = 0, .io_semantics = io_sem);
.component = 0, .io_semantics = io_sem, .src_type = nir_type_float32);
}
/* Gather output data and types */
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
if (state.has_user_edgeflags)
ngg_nogs_store_edgeflag_to_lds(b, &state);
if (state.streamout_enabled) {
/* TODO: support culling after streamout. */
assert(!options->can_cull);
ngg_nogs_store_xfb_outputs_to_lds(b, &state);
b->cursor = nir_after_cf_list(&impl->body);
ngg_nogs_build_streamout(b, &state);
}
/* Take care of late primitive export */
if (!state.early_prim_export) {
b->cursor = nir_after_cf_list(&impl->body);
emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
}
/* Export varyings for GFX11+ */
if (state.options->gfx_level >= GFX11) {
vs_output outputs[64];
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
unsigned num_outputs = gather_vs_outputs(b, outputs, &state);
if (num_outputs) {
b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
create_vertex_param_phis(b, num_outputs, outputs);
b->cursor = nir_after_cf_list(&impl->body);
if (!num_es_threads)
num_es_threads = nir_load_merged_wave_info_amd(b);
export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs,
options->vs_output_param_offset);
}
}
nir_metadata_preserve(impl, nir_metadata_none);