broadcom/compiler: track pipelineable ldvary sequences
If we have two (or more) smooth varyings like this:

    nop t3; ldvary.rf0
    fmul t5, t3, t0
    fadd t6, t5, r5
    nop t7; ldvary.rf0
    fmul t9, t7, t0
    fadd t10, t9, r5
    nop t11; ldvary.rf0
    fmul t13, t11, t0
    fadd t14, t13, r5

We may be able to pipeline them like this:

    nop                ; nop              ; ldvary.r4
    nop                ; fmul r0, r4, rf0 ; ldvary.r1
    fadd rf13, r0, r5  ; fmul r2, r1, rf0 ; ldvary.r3
    fadd rf12, r2, r5  ; fmul r4, r3, rf0 ; ldvary.r0

But in order to do this, we will need to manually tweak the QPU scheduling. This patch tracks information about ldvary sequences that are good candidates for pipelining, and a follow-up patch will use this information to pipeline them when we emit the QPU code.

v2 (apinheiro):
- Rename the v3d_compile fields to avoid confusion with the qinst fields.
- Assert that a sequence's start instruction is not the same as the end.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9304>
This commit is contained in:
@@ -921,6 +921,59 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
|
||||
c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
ldvary_sequence_inst(struct v3d_compile *c, struct qreg result)
|
||||
{
|
||||
struct qinst *producer =
|
||||
(struct qinst *) c->cur_block->instructions.prev;
|
||||
assert(producer);
|
||||
producer->ldvary_pipelining = true;
|
||||
c->ldvary_sequence_end_inst = producer;
|
||||
return result;
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
emit_smooth_varying(struct v3d_compile *c,
|
||||
struct qinst *ldvary,
|
||||
struct qreg vary, struct qreg w, struct qreg r5)
|
||||
{
|
||||
if (ldvary) {
|
||||
c->ldvary_sequence_length++;
|
||||
ldvary->ldvary_pipelining = true;
|
||||
if (c->ldvary_sequence_length == 1) {
|
||||
ldvary->ldvary_pipelining_start = true;
|
||||
c->ldvary_sequence_start_inst = ldvary;
|
||||
}
|
||||
}
|
||||
return ldvary_sequence_inst(c, vir_FADD(c,
|
||||
ldvary_sequence_inst(c, vir_FMUL(c, vary, w)), r5));
|
||||
}
|
||||
|
||||
static void
|
||||
break_smooth_varying_sequence(struct v3d_compile *c)
|
||||
{
|
||||
if (!c->ldvary_sequence_start_inst) {
|
||||
assert(!c->ldvary_sequence_end_inst);
|
||||
assert(c->ldvary_sequence_length == 0);
|
||||
return;
|
||||
}
|
||||
|
||||
assert(c->ldvary_sequence_start_inst);
|
||||
assert(c->ldvary_sequence_end_inst);
|
||||
assert(c->ldvary_sequence_start_inst != c->ldvary_sequence_end_inst);
|
||||
|
||||
/* We need at least two smooth ldvary sequences to do some pipelining */
|
||||
if (c->ldvary_sequence_length == 1)
|
||||
c->ldvary_sequence_start_inst->ldvary_pipelining_start = false;
|
||||
|
||||
if (c->ldvary_sequence_length > 1)
|
||||
c->ldvary_sequence_end_inst->ldvary_pipelining_end = true;
|
||||
|
||||
c->ldvary_sequence_length = 0;
|
||||
c->ldvary_sequence_start_inst = NULL;
|
||||
c->ldvary_sequence_end_inst = NULL;
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
int8_t input_idx, uint8_t swizzle, int array_index)
|
||||
@@ -928,9 +981,10 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
|
||||
struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
|
||||
|
||||
struct qinst *ldvary = NULL;
|
||||
struct qreg vary;
|
||||
if (c->devinfo->ver >= 41) {
|
||||
struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
|
||||
ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
|
||||
c->undef, c->undef);
|
||||
ldvary->qpu.sig.ldvary = true;
|
||||
vary = vir_emit_def(c, ldvary);
|
||||
@@ -955,7 +1009,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
*/
|
||||
if (!var) {
|
||||
assert(input_idx < 0);
|
||||
return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
|
||||
return emit_smooth_varying(c, ldvary, vary, c->payload_w, r5);
|
||||
}
|
||||
|
||||
int i = c->num_inputs++;
|
||||
@@ -969,19 +1023,22 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
case INTERP_MODE_SMOOTH:
|
||||
if (var->data.centroid) {
|
||||
BITSET_SET(c->centroid_flags, i);
|
||||
result = vir_FADD(c, vir_FMUL(c, vary,
|
||||
c->payload_w_centroid), r5);
|
||||
result = emit_smooth_varying(c, ldvary, vary,
|
||||
c->payload_w_centroid, r5);
|
||||
} else {
|
||||
result = vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
|
||||
result = emit_smooth_varying(c, ldvary, vary,
|
||||
c->payload_w, r5);
|
||||
}
|
||||
break;
|
||||
|
||||
case INTERP_MODE_NOPERSPECTIVE:
|
||||
break_smooth_varying_sequence(c);
|
||||
BITSET_SET(c->noperspective_flags, i);
|
||||
result = vir_FADD(c, vir_MOV(c, vary), r5);
|
||||
break;
|
||||
|
||||
case INTERP_MODE_FLAT:
|
||||
break_smooth_varying_sequence(c);
|
||||
BITSET_SET(c->flat_shade_flags, i);
|
||||
vir_MOV_dest(c, c->undef, vary);
|
||||
result = vir_MOV(c, r5);
|
||||
@@ -2019,6 +2076,8 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break_smooth_varying_sequence(c);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@@ -162,6 +162,19 @@ struct qinst {
|
||||
* otherwise.
|
||||
*/
|
||||
int uniform;
|
||||
|
||||
/* Set if this instruction participates in a pipelinable sequence of
|
||||
* smooth varyings.
|
||||
*/
|
||||
bool ldvary_pipelining;
|
||||
/* Set if this is the ldvary instruction starting a pipelinable
|
||||
* sequence of smooth varyings.
|
||||
*/
|
||||
bool ldvary_pipelining_start;
|
||||
/* Set if this is the fadd instruction ending a pipelinable
|
||||
* sequence of smooth varyings.
|
||||
*/
|
||||
bool ldvary_pipelining_end;
|
||||
};
|
||||
|
||||
enum quniform_contents {
|
||||
@@ -769,6 +782,11 @@ struct v3d_compile {
|
||||
uint32_t program_id;
|
||||
uint32_t variant_id;
|
||||
|
||||
/* Used to track pipelinable sequences of smooth varyings */
|
||||
struct qinst *ldvary_sequence_start_inst;
|
||||
struct qinst *ldvary_sequence_end_inst;
|
||||
uint32_t ldvary_sequence_length;
|
||||
|
||||
/* Set to compile program in 1x, 2x, or 4x threaded mode, where
|
||||
* SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
|
||||
* limiting ourselves to the part of the physical reg space.
|
||||
|
Reference in New Issue
Block a user