ac/nir/ngg: gs support 16bit outputs

radeonsi uses 16-bit varying slots, so the NGG GS lowering needs to handle them.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19697>
This commit is contained in:
Qiang Yu
2022-11-12 11:58:03 +08:00
committed by Marge Bot
parent 68519891a7
commit abe2e99e9e

View File

@@ -111,7 +111,6 @@ typedef struct
const ac_nir_lower_ngg_options *options;
nir_function_impl *impl;
nir_variable *output_vars[VARYING_SLOT_MAX][4];
nir_variable *current_clear_primflag_idx_var;
int const_out_vtxcnt[4];
int const_out_prmcnt[4];
@@ -124,7 +123,14 @@ typedef struct
bool found_out_vtxcnt[4];
bool output_compile_time_known;
bool streamout_enabled;
/* 32 bit outputs */
nir_variable *output_vars[VARYING_SLOT_MAX][4];
gs_output_info output_info[VARYING_SLOT_MAX];
/* 16 bit outputs */
nir_variable *output_vars_16bit_hi[16][4];
nir_variable *output_vars_16bit_lo[16][4];
gs_output_info output_info_16bit_hi[16];
gs_output_info output_info_16bit_lo[16];
} lower_ngg_gs_state;
/* LDS layout of Mesh Shader workgroup info. */
@@ -2470,10 +2476,7 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
unsigned location = io_sem.location + base_offset;
assert(location < VARYING_SLOT_MAX);
unsigned base_index = base + base_offset;
assert(base_index < VARYING_SLOT_MAX);
nir_ssa_def *store_val = intrin->src[0].ssa;
nir_alu_type src_type = nir_intrinsic_src_type(intrin);
@@ -2487,8 +2490,25 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
assert(store_val->bit_size <= 32);
assert(glsl_base_type_get_bit_size(val_type) == store_val->bit_size);
/* Save output usage info. */
gs_output_info *info = &s->output_info[location];
/* Get corresponding output variable and usage info. */
nir_variable **var;
gs_output_info *info;
if (location >= VARYING_SLOT_VAR0_16BIT) {
unsigned index = location - VARYING_SLOT_VAR0_16BIT;
assert(index < 16);
if (io_sem.high_16bits) {
var = s->output_vars_16bit_hi[index];
info = s->output_info_16bit_hi + index;
} else {
var = s->output_vars_16bit_lo[index];
info = s->output_info_16bit_lo + index;
}
} else {
assert(location < VARYING_SLOT_MAX);
var = s->output_vars[location];
info = s->output_info + location;
}
for (unsigned comp = 0; comp < store_val->num_components; ++comp) {
if (!(writemask & (1 << comp)))
@@ -2516,14 +2536,13 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
info->no_varying = io_sem.no_varying;
info->no_sysval_output = io_sem.no_sysval_output;
nir_variable *var = s->output_vars[location][component];
if (!var) {
var = nir_local_variable_create(s->impl, glsl_scalar_type(val_type), "output");
s->output_vars[location][component] = var;
if (!var[component]) {
var[component] =
nir_local_variable_create(s->impl, glsl_scalar_type(val_type), "output");
}
assert(glsl_get_base_type(var->type) == val_type);
assert(glsl_get_base_type(var[component]->type) == val_type);
nir_store_var(b, var, nir_channel(b, store_val, comp), 0x1u);
nir_store_var(b, var[component], nir_channel(b, store_val, comp), 0x1u);
}
nir_instr_remove(&intrin->instr);
@@ -2598,6 +2617,51 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
}
}
/* Store 16bit outputs to LDS. */
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
u_foreach_bit(slot, b->shader->info.outputs_written_16bit) {
unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
unsigned mask_lo = gs_output_component_mask_with_stream(s->output_info_16bit_lo + slot, stream);
unsigned mask_hi = gs_output_component_mask_with_stream(s->output_info_16bit_hi + slot, stream);
unsigned mask = mask_lo | mask_hi;
if (!mask)
continue;
nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_ssa_def *values[4] = {0};
for (int c = start; c < start + count; ++c) {
/* Load and reset the low half var. */
nir_ssa_def *lo = undef;
nir_variable *var_lo = s->output_vars_16bit_lo[slot][c];
if (var_lo) {
lo = nir_load_var(b, var_lo);
nir_store_var(b, var_lo, undef, 1);
}
/* Load and reset the high half var. */
nir_ssa_def *hi = undef;
nir_variable *var_hi = s->output_vars_16bit_hi[slot][c];
if (var_hi) {
hi = nir_load_var(b, var_hi);
nir_store_var(b, var_hi, undef, 1);
}
values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
}
nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = packed_location * 16 + start * 4,
.align_mul = 4);
}
}
/* Calculate and store per-vertex primitive flags based on vertex counts:
* - bit 0: whether this vertex finishes a primitive (a real primitive, not the strip)
* - bit 1: whether the primitive index is odd (if we are emitting triangle strips, otherwise always 0)
@@ -2741,7 +2805,8 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
}
unsigned num_outputs = 0;
vs_output outputs[64];
/* The extra 16 entries are for the 16-bit varying slots. */
vs_output outputs[VARYING_SLOT_MAX + 16];
u_foreach_bit64(slot, b->shader->info.outputs_written) {
gs_output_info *info = &s->output_info[slot];
@@ -2802,6 +2867,80 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
}
}
/* 16bit outputs */
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
u_foreach_bit(i, b->shader->info.outputs_written_16bit) {
unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i));
unsigned slot = VARYING_SLOT_VAR0_16BIT + i;
gs_output_info *info_lo = s->output_info_16bit_lo + i;
gs_output_info *info_hi = s->output_info_16bit_hi + i;
unsigned mask_lo = info_lo->no_varying ? 0 :
gs_output_component_mask_with_stream(info_lo, 0);
unsigned mask_hi = info_hi->no_varying ? 0 :
gs_output_component_mask_with_stream(info_hi, 0);
unsigned mask = mask_lo | mask_hi;
if (!mask)
continue;
nir_io_semantics io_sem_lo = {
.location = slot,
.num_slots = 1,
.no_varying = info_lo->no_varying,
};
nir_io_semantics io_sem_hi = {
.location = slot,
.num_slots = 1,
.no_varying = info_hi->no_varying,
.high_16bits = true,
};
vs_output *output = NULL;
if (s->options->gfx_level >= GFX11 &&
s->options->vs_output_param_offset[slot] <= AC_EXP_PARAM_OFFSET_31) {
output = &outputs[num_outputs++];
output->slot = slot;
}
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_ssa_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = packed_location * 16 + start * 4,
.align_mul = 4);
for (int i = 0; i < count; i++) {
nir_ssa_def *val = nir_channel(b, load, i);
unsigned comp = start + i;
if (output) {
/* The low and high halves were already packed into one dword when stored to LDS. */
output->chan[comp] = val;
} else {
if (mask_lo & BITFIELD_BIT(comp)) {
nir_store_output(b, nir_unpack_32_2x16_split_x(b, val),
nir_imm_int(b, 0),
.base = info_lo->base,
.io_semantics = io_sem_lo,
.component = comp,
.write_mask = 1);
}
if (mask_hi & BITFIELD_BIT(comp)) {
nir_store_output(b, nir_unpack_32_2x16_split_y(b, val),
nir_imm_int(b, 0),
.base = info_hi->base,
.io_semantics = io_sem_hi,
.component = comp,
.write_mask = 1);
}
}
}
}
}
nir_export_vertex_amd(b);
nir_pop_if(b, if_vtx_export_thread);