v3d: add initial compiler plumbing for geometry shaders

Most of the relevant work happens in the v3d_nir_lower_io. Since
geometry shaders can write any number of output vertices, this pass
injects a few variables into the shader code to keep track of things
like the number of vertices emitted or the offsets into the VPM
of the current vertex output, etc. This is also where we handle
EmitVertex() and EndPrimitive() intrinsics.

The geometry shader VPM output layout has a specific structure
with a 32-bit general header, then another 32-bit header slot for
each output vertex, and finally the actual vertex data.

When vertex shaders are paired with geometry shaders we also need
to consider the following:
  - Only geometry shaders emit fixed function outputs.
  - The coordinate shader used for the vertex stage during binning must
    not drop varyings other than those used by transform feedback, since
    these may be read by the binning GS.

v2:
 - Use MAX3 instead of a chain of MAX2 (Alejandro).
 - Make all loop variables unsigned in ntq_setup_gs_inputs (Alejandro)
 - Update comment in IO lowering so it includes the GS stage (Alejandro)

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
This commit is contained in:
Iago Toral Quiroga
2019-10-28 13:24:44 +01:00
parent f63750accf
commit 5d578c27ce
7 changed files with 778 additions and 110 deletions

View File

@@ -1367,11 +1367,20 @@ emit_frag_end(struct v3d_compile *c)
vir_emit_tlb_color_write(c, rt);
}
/* Writes "val" to the VPM at the row given by the runtime value in
 * "vpm_index", using the STVPMV instruction. Only valid on V3D 4.x+,
 * which is the first version with an indirect VPM store.
 */
static inline void
vir_VPM_WRITE_indirect(struct v3d_compile *c,
struct qreg val,
struct qreg vpm_index)
{
/* STVPMV does not exist before HW version 4.0. */
assert(c->devinfo->ver >= 40);
vir_STVPMV(c, vpm_index, val);
}
static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
if (c->devinfo->ver >= 40) {
vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
} else {
/* XXX: v3d33_vir_vpm_write_setup(c); */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
@@ -1387,6 +1396,15 @@ emit_vert_end(struct v3d_compile *c)
vir_VPMWT(c);
}
/* Emits the end-of-shader sequence for a geometry shader.
 *
 * Unlike vertex shaders, a GS has no fixed trailing VPM writes to wait
 * behind, so on the affected HW revisions we must insert an explicit
 * VPMWT to satisfy the GFXH-1684 errata.
 */
static void
emit_geom_end(struct v3d_compile *c)
{
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
/* The errata only applies to versions 4.0 through 4.2. */
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
vir_VPMWT(c);
}
void
v3d_optimize_nir(struct nir_shader *s)
{
@@ -1474,7 +1492,7 @@ ntq_emit_vpm_read(struct v3d_compile *c,
}
static void
ntq_setup_vpm_inputs(struct v3d_compile *c)
ntq_setup_vs_inputs(struct v3d_compile *c)
{
/* Figure out how many components of each vertex attribute the shader
* uses. Each variable should have been split to individual
@@ -1565,24 +1583,69 @@ program_reads_point_coord(struct v3d_compile *c)
}
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
get_sorted_input_variables(struct v3d_compile *c,
unsigned *num_entries,
nir_variable ***vars)
{
unsigned num_entries = 0;
*num_entries = 0;
nir_foreach_variable(var, &c->s->inputs)
num_entries++;
(*num_entries)++;
nir_variable *vars[num_entries];
*vars = ralloc_array(c, nir_variable *, *num_entries);
unsigned i = 0;
nir_foreach_variable(var, &c->s->inputs)
vars[i++] = var;
(*vars)[i++] = var;
/* Sort the variables so that we emit the input setup in
* driver_location order. This is required for VPM reads, whose data
* is fetched into the VPM in driver_location (TGSI register index)
* order.
*/
qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
}
/* Builds the c->input_slots[] table for geometry shader inputs.
 *
 * Walks the shader's input variables in driver_location order (the order
 * the data is laid out in the VPM) and records a v3d_varying_slot for
 * every component of every per-vertex input, bumping c->num_inputs as it
 * goes.
 */
static void
ntq_setup_gs_inputs(struct v3d_compile *c)
{
nir_variable **vars;
unsigned num_entries;
get_sorted_input_variables(c, &num_entries, &vars);
for (unsigned i = 0; i < num_entries; i++) {
nir_variable *var = vars[i];
/* All GS inputs are arrays with as many entries as vertices
 * in the input primitive, but here we only care about the
 * per-vertex input type.
 */
const struct glsl_type *type = glsl_without_array(var->type);
/* NOTE(review): assumes the per-vertex type's length covers the
 * slots the variable occupies; non-array per-vertex types report
 * length 0, hence the clamp to 1 — confirm against glsl_get_length.
 */
unsigned array_len = MAX2(glsl_get_length(type), 1);
unsigned loc = var->data.driver_location;
/* Grow c->inputs so it can hold 4 components for every slot up
 * to and including this variable's last one.
 */
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + array_len) * 4);
for (unsigned j = 0; j < array_len; j++) {
unsigned num_elements = glsl_get_vector_elements(type);
for (unsigned k = 0; k < num_elements; k++) {
/* Component index within the vec4 slot, offset by
 * the variable's starting component.
 */
unsigned chan = var->data.location_frac + k;
unsigned input_idx = c->num_inputs++;
struct v3d_varying_slot slot =
v3d_slot_from_slot_and_component(var->data.location + j, chan);
c->input_slots[input_idx] = slot;
}
}
}
}
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
nir_variable **vars;
unsigned num_entries;
get_sorted_input_variables(c, &num_entries, &vars);
for (unsigned i = 0; i < num_entries; i++) {
nir_variable *var = vars[i];
@@ -1948,6 +2011,40 @@ ntq_emit_color_write(struct v3d_compile *c,
}
}
/* Emits a geometry shader store_output.
 *
 * The destination VPM offset is the intrinsic's constant base plus the
 * dynamic offset in src[1] (computed by the GS IO lowering pass); the
 * scalar value to store is src[0].
 */
static void
emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        assert(instr->num_components == 1);

        struct qreg base = vir_uniform_ui(c, nir_intrinsic_base(instr));
        struct qreg dynamic_offset = ntq_get_src(c, instr->src[1], 0);
        struct qreg vpm_offset = vir_ADD(c, base, dynamic_offset);
        struct qreg value = ntq_get_src(c, instr->src[0], 0);

        vir_VPM_WRITE_indirect(c, value, vpm_offset);
}
/* Dispatches a store_output intrinsic to the per-stage handler: color
 * writes for fragment shaders, header/offset-based VPM stores for
 * geometry shaders, and direct VPM writes for vertex shaders.
 */
static void
ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        /* XXX perf: Use stvpmv with uniform non-constant offsets and
         * stvpmd with non-uniform offsets and enable
         * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
         */
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                ntq_emit_color_write(c, instr);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_store_output_gs(c, instr);
                break;
        default:
                assert(c->s->info.stage == MESA_SHADER_VERTEX);
                assert(instr->num_components == 1);
                vir_VPM_WRITE(c,
                              ntq_get_src(c, instr->src[0], 0),
                              nir_intrinsic_base(instr));
                break;
        }
}
static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
@@ -2090,19 +2187,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_store_output:
/* XXX perf: Use stvpmv with uniform non-constant offsets and
* stvpmd with non-uniform offsets and enable
* PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
*/
if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
ntq_emit_color_write(c, instr);
} else {
assert(instr->num_components == 1);
vir_VPM_WRITE(c,
ntq_get_src(c, instr->src[0], 0),
nir_intrinsic_base(instr));
}
ntq_emit_store_output(c, instr);
break;
case nir_intrinsic_image_deref_size:
@@ -2214,6 +2299,34 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
break;
case nir_intrinsic_load_per_vertex_input: {
/* col: vertex index, row = varying index */
struct qreg col = ntq_get_src(c, instr->src[0], 0);
uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
nir_intrinsic_component(instr);
for (int i = 0; i < instr->num_components; i++) {
struct qreg row = vir_uniform_ui(c, row_idx++);
ntq_store_dest(c, &instr->dest, i,
vir_LDVPMG_IN(c, row, col));
}
break;
}
case nir_intrinsic_emit_vertex:
case nir_intrinsic_end_primitive:
unreachable("Should have been lowered in v3d_nir_lower_io");
break;
case nir_intrinsic_load_primitive_id: {
/* gl_PrimitiveIdIn is written by the GBG in the first word of
* VPM output header. According to docs, we should read this
* using ldvpm(v,d)_in (See Table 71).
*/
ntq_store_dest(c, &instr->dest, 0,
vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
break;
}
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
@@ -2636,10 +2749,21 @@ nir_to_vir(struct v3d_compile *c)
c->spill_size += V3D_CHANNELS * c->s->scratch_size;
}
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
switch (c->s->info.stage) {
case MESA_SHADER_VERTEX:
ntq_setup_vs_inputs(c);
break;
case MESA_SHADER_GEOMETRY:
ntq_setup_gs_inputs(c);
break;
case MESA_SHADER_FRAGMENT:
ntq_setup_fs_inputs(c);
else
ntq_setup_vpm_inputs(c);
break;
case MESA_SHADER_COMPUTE:
break;
default:
unreachable("unsupported shader stage");
}
ntq_setup_outputs(c);
@@ -2785,6 +2909,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
case MESA_SHADER_FRAGMENT:
emit_frag_end(c);
break;
case MESA_SHADER_GEOMETRY:
emit_geom_end(c);
break;
case MESA_SHADER_VERTEX:
emit_vert_end(c);
break;