Revert "vc4: Lazily emit our FS/VS input loads."
This reverts commit 292c24ddac. It broke a lot of GLES2 deqp, and I see at
least one problem that will require some serious rework to fix.

@@ -174,10 +174,10 @@ struct vc4_compiled_shader {
 
         uint8_t num_inputs;
 
-        /** Byte offsets for the start of the vertex attributes. */
-        uint8_t vattr_offsets[8];
-        /** Total size of the vertex inputs, in bytes. */
-        uint8_t vattr_total_size;
+        /* Byte offsets for the start of the vertex attributes 0-7, and the
+         * total size as "attribute" 8.
+         */
+        uint8_t vattr_offsets[9];
         uint8_t vattrs_live;
 
         const struct vc4_fs_inputs *fs_inputs;
@@ -170,14 +170,14 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
         /* VC4_DIRTY_COMPILED_VS */
         cl_u16(&shader_rec, 0); /* vs num uniforms */
         cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.vs->vattr_total_size);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         /* VC4_DIRTY_COMPILED_CS */
         cl_u16(&shader_rec, 0); /* cs num uniforms */
         cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.cs->vattr_total_size);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
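The two hunks above rely on the restored convention: vattr_offsets[0..7] hold the byte offset of each vertex attribute in the VPM, and vattr_offsets[8] is the running total that vc4_emit_gl_shader_state() writes out as the attribute stride. Below is a minimal standalone sketch of that bookkeeping; the attribute sizes are made up purely for illustration, and the loop mirrors the one restored later in this diff in vc4_get_compiled_shader().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical per-attribute sizes, already padded to 4 bytes the
         * way emit_vertex_input() does with align(attr_size, 4).
         */
        uint8_t vattr_sizes[8] = { 12, 8, 0, 4, 0, 0, 0, 0 };
        uint8_t vattr_offsets[9];
        uint8_t vattrs_live = 0;

        /* Prefix sum over the sizes: entries 0-7 are start offsets,
         * entry 8 ends up being the total per-vertex size.
         */
        vattr_offsets[0] = 0;
        for (int i = 0; i < 8; i++) {
                vattr_offsets[i + 1] = vattr_offsets[i] + vattr_sizes[i];
                if (vattr_sizes[i])
                        vattrs_live |= (1 << i);
        }

        printf("total vertex size: %d bytes, live mask: 0x%02x\n",
               vattr_offsets[8], (unsigned)vattrs_live);
        return 0;
}

With these example sizes it prints a 24-byte stride and a live mask of 0x0b for the three non-empty attributes.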
@@ -733,14 +733,11 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
-        uint32_t vpm_attr = c->next_vpm_input++;
-
-        c->vpm_input_order[vpm_attr] = attr;
 
-        c->vattr_sizes[vpm_attr] = align(attr_size, 4);
+        c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 c->inputs[attr * 4 + i] =
-                        qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i));
+                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
                 c->num_inputs++;
         }
 }
@@ -1469,7 +1466,6 @@ emit_stub_vpm_read(struct vc4_compile *c)
         if (c->num_inputs)
                 return;
 
-        c->next_vpm_input++;
         c->vattr_sizes[0] = 4;
         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
         c->num_inputs++;
@@ -1556,6 +1552,64 @@ vc4_optimize_nir(struct nir_shader *s)
         } while (progress);
 }
 
+static int
+driver_location_compare(const void *in_a, const void *in_b)
+{
+        const nir_variable *const *a = in_a;
+        const nir_variable *const *b = in_b;
+
+        return (*a)->data.driver_location - (*b)->data.driver_location;
+}
+
+static void
+ntq_setup_inputs(struct vc4_compile *c)
+{
+        unsigned num_entries = 0;
+        nir_foreach_variable(var, &c->s->inputs)
+                num_entries++;
+
+        nir_variable *vars[num_entries];
+
+        unsigned i = 0;
+        nir_foreach_variable(var, &c->s->inputs)
+                vars[i++] = var;
+
+        /* Sort the variables so that we emit the input setup in
+         * driver_location order. This is required for VPM reads, whose data
+         * is fetched into the VPM in driver_location (TGSI register index)
+         * order.
+         */
+        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+
+        for (unsigned i = 0; i < num_entries; i++) {
+                nir_variable *var = vars[i];
+                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+                unsigned loc = var->data.driver_location;
+
+                assert(array_len == 1);
+                (void)array_len;
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + 1) * 4);
+
+                if (c->stage == QSTAGE_FRAG) {
+                        if (var->data.location == VARYING_SLOT_POS) {
+                                emit_fragcoord_input(c, loc);
+                        } else if (var->data.location == VARYING_SLOT_PNTC ||
+                                   (var->data.location >= VARYING_SLOT_VAR0 &&
+                                    (c->fs_key->point_sprite_mask &
+                                     (1 << (var->data.location -
+                                            VARYING_SLOT_VAR0))))) {
+                                c->inputs[loc * 4 + 0] = c->point_x;
+                                c->inputs[loc * 4 + 1] = c->point_y;
+                        } else {
+                                emit_fragment_input(c, loc, var->data.location);
+                        }
+                } else {
+                        emit_vertex_input(c, loc);
+                }
+        }
+}
+
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
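As the comment in the restored ntq_setup_inputs() says, inputs have to be emitted in driver_location order because the VPM is filled in that order; once sorted, attribute N simply reads VPM slots N*4 through N*4+3 (see the qir_MOV in emit_vertex_input above). The sketch below shows the same qsort pattern on a made-up stand-in struct; fake_var, location_compare and the sample locations are all hypothetical, not driver code.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for nir_variable: only the field the comparator needs. */
struct fake_var {
        int driver_location;
};

static int
location_compare(const void *in_a, const void *in_b)
{
        /* qsort hands us pointers to the array elements, which are
         * themselves pointers, so dereference twice.
         */
        const struct fake_var *const *a = in_a;
        const struct fake_var *const *b = in_b;

        return (*a)->driver_location - (*b)->driver_location;
}

int main(void)
{
        struct fake_var v2 = { 2 }, v0 = { 0 }, v1 = { 1 };
        struct fake_var *vars[] = { &v2, &v0, &v1 };

        /* Same pattern as ntq_setup_inputs(): sort the pointers so the
         * inputs get set up in driver_location order.
         */
        qsort(vars, 3, sizeof(*vars), location_compare);

        for (int i = 0; i < 3; i++) {
                int attr = vars[i]->driver_location;
                /* Eager VPM reads: attribute N occupies slots N*4..N*4+3. */
                printf("attr %d -> VPM slots %d..%d\n",
                       attr, attr * 4, attr * 4 + 3);
        }
        return 0;
}

The double dereference in the comparator matches driver_location_compare(), which sorts an array of nir_variable pointers rather than the variables themselves.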
@@ -1686,73 +1740,10 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 return;
         }
 
-        /* Size our inputs array as far as this input. Input arrays are
-         * small, and we don't have a shader_info field that tells us up front
-         * what the maximum driver_location is.
-         */
-        uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0];
-        if ((loc + 1) * 4 > c->inputs_array_size) {
-                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
-                                  (loc + 1) * 4);
-        }
-
-        /* If we've already loaded this input, just return it. This would
-         * happen for VPM loads, where we load an entire vertex attribute at
-         * once, or possibly also in the FS if we haven't CSEed away repeated
-         * loads.
-         */
+        uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0];
         int comp = nir_intrinsic_component(instr);
-        if (c->inputs[loc * 4 + comp].file != QFILE_NULL) {
-                ntq_store_dest(c, &instr->dest, 0,
-                               qir_MOV(c, c->inputs[loc * 4 + comp]));
-                return;
-        }
-
-        /* In the FS, we always have to fully drain our FS FIFO before
-         * terminating the shader. For the VS we only have to drain whatever
-         * VPM setup we configure, but vc4_qpu_emit.c configures it for the
-         * entire vertex attribute space. Because of this, we emit our lazy
-         * varying/VPM loads at the last top level basic block.
-         */
-        struct qblock *saved_cur_block = c->cur_block;
-        c->cur_block = c->last_top_block;
-
-        /* Look up the NIR variable for this input, so we can see how big the
-         * input is, or what sort of interpolation is necessary.
-         */
-        nir_variable *var = NULL;
-        nir_foreach_variable(search_var, &c->s->inputs) {
-                unsigned search_len = MAX2(glsl_get_length(search_var->type), 1);
-                unsigned search_loc = search_var->data.driver_location;
-
-                if (loc >= search_loc && loc < search_loc + search_len) {
-                        var = search_var;
-                        break;
-                }
-        }
-        assert(var);
-
-        if (c->stage == QSTAGE_FRAG) {
-                if (var->data.location == VARYING_SLOT_POS) {
-                        emit_fragcoord_input(c, loc);
-                } else if (var->data.location == VARYING_SLOT_PNTC ||
-                           (var->data.location >= VARYING_SLOT_VAR0 &&
-                            (c->fs_key->point_sprite_mask &
-                             (1 << (var->data.location -
-                                    VARYING_SLOT_VAR0))))) {
-                        c->inputs[loc * 4 + 0] = c->point_x;
-                        c->inputs[loc * 4 + 1] = c->point_y;
-                } else {
-                        emit_fragment_input(c, loc, var->data.location);
-                }
-        } else {
-                emit_vertex_input(c, loc);
-        }
-
-        c->cur_block = saved_cur_block;
-
         ntq_store_dest(c, &instr->dest, 0,
-                       qir_MOV(c, c->inputs[loc * 4 + comp]));
+                       qir_MOV(c, c->inputs[offset * 4 + comp]));
 }
 
 static void
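With the lazy path removed, ntq_emit_load_input() is reduced to indexing the flat c->inputs[] array that ntq_setup_inputs() fills up front: each driver_location owns four consecutive slots, so component comp of input offset lives at offset * 4 + comp. Here is a toy illustration of just that indexing; the array contents are arbitrary stand-ins for QIR registers, not real driver state.

#include <stdio.h>

int main(void)
{
        /* Flat "inputs" array as in struct vc4_compile: four slots (one per
         * component) for each driver_location. Values are stand-ins.
         */
        int inputs[3 * 4];
        for (int loc = 0; loc < 3; loc++)
                for (int comp = 0; comp < 4; comp++)
                        inputs[loc * 4 + comp] = loc * 100 + comp;

        /* Same lookup as the reverted ntq_emit_load_input(): offset * 4 + comp. */
        int offset = 1, comp = 2;
        printf("input[loc=%d].%c -> slot %d (value %d)\n",
               offset, "xyzw"[comp], offset * 4 + comp,
               inputs[offset * 4 + comp]);
        return 0;
}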
@@ -2170,6 +2161,7 @@ nir_to_qir(struct vc4_compile *c)
         if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
 
+        ntq_setup_inputs(c);
         ntq_setup_outputs(c);
         ntq_setup_uniforms(c);
         ntq_setup_registers(c, &c->s->registers);
@@ -2596,17 +2588,14 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         } else {
                 shader->num_inputs = c->num_inputs;
 
-                uint8_t next_vattr_offset = 0;
-                for (int i = 0; i < c->next_vpm_input; i++) {
-                        if (!c->vattr_sizes[i])
-                                continue;
+                shader->vattr_offsets[0] = 0;
+                for (int i = 0; i < 8; i++) {
+                        shader->vattr_offsets[i + 1] =
+                                shader->vattr_offsets[i] + c->vattr_sizes[i];
 
-                        uint32_t nir_attr = c->vpm_input_order[i];
-                        shader->vattr_offsets[nir_attr] = next_vattr_offset;
-                        next_vattr_offset += c->vattr_sizes[i];
-                        shader->vattrs_live |= (1 << nir_attr);
+                        if (c->vattr_sizes[i])
+                                shader->vattrs_live |= (1 << i);
                 }
-                shader->vattr_total_size = next_vattr_offset;
         }
 
         shader->failed = c->failed;
@@ -461,13 +461,6 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
-        /**
-         * Order in which the vattrs were loaded by the program, to arrange
-         * vattr_offsets[] in the program data appropriately.
-         */
-        uint8_t vpm_input_order[8];
-        uint8_t next_vpm_input;
-
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *