broadcom/compiler: track pipelineable ldvary sequences

If we have two (or more) smooth varyings like this:

nop t3; ldvary.rf0
fmul t5, t3, t0
fadd t6, t5, r5
nop t7; ldvary.rf0
fmul t9, t7, t0
fadd t10, t9, r5
nop t11; ldvary.rf0
fmul t13, t11, t0
fadd t14, t13, r5

We may be able to pipeline them like this:

nop                  ; nop               ; ldvary.r4
nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ; ldvary.r3
fadd  rf12, r2, r5   ; fmul  r4, r3, rf0 ; ldvary.r0

But in order to do this, we will need to manually tweak the
QPU scheduling.

This patch tracks information about ldvary sequences that are
good candidates for pipelining, and a follow-up patch will
use this information to pipeline them when we emit the QPU
code.

v2 (apinheiro):
  - Rename the v3d_compile fields to avoid confusion with the qinst fields.
  - Assert that a sequence's start instruction is not the same as the end.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9304>
This commit is contained in:
Iago Toral Quiroga
2021-02-26 12:23:12 +01:00
parent c2c2cdc3d3
commit 1d021539a2
2 changed files with 83 additions and 6 deletions

View File

@@ -921,6 +921,59 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
}
/**
 * Tags the instruction at the tail of the current block as a member of a
 * pipelinable smooth-varying sequence and remembers it as the sequence's
 * most recent end candidate.
 *
 * Intended to wrap a call that has just emitted an instruction: 'result' is
 * passed through untouched so the wrapper can be nested inside expressions.
 */
static struct qreg
ldvary_sequence_inst(struct v3d_compile *c, struct qreg result)
{
        /* NOTE(review): the cast assumes the list link is the first member
         * of struct qinst, and that the block is not empty -- confirm.
         */
        struct qinst *tail_inst =
                (struct qinst *) c->cur_block->instructions.prev;

        assert(tail_inst);

        c->ldvary_sequence_end_inst = tail_inst;
        tail_inst->ldvary_pipelining = true;

        return result;
}
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
struct qinst *ldvary,
struct qreg vary, struct qreg w, struct qreg r5)
{
if (ldvary) {
c->ldvary_sequence_length++;
ldvary->ldvary_pipelining = true;
if (c->ldvary_sequence_length == 1) {
ldvary->ldvary_pipelining_start = true;
c->ldvary_sequence_start_inst = ldvary;
}
}
return ldvary_sequence_inst(c, vir_FADD(c,
ldvary_sequence_inst(c, vir_FMUL(c, vary, w)), r5));
}
static void
break_smooth_varying_sequence(struct v3d_compile *c)
{
if (!c->ldvary_sequence_start_inst) {
assert(!c->ldvary_sequence_end_inst);
assert(c->ldvary_sequence_length == 0);
return;
}
assert(c->ldvary_sequence_start_inst);
assert(c->ldvary_sequence_end_inst);
assert(c->ldvary_sequence_start_inst != c->ldvary_sequence_end_inst);
/* We need at least two smooth ldvary sequences to do some pipelining */
if (c->ldvary_sequence_length == 1)
c->ldvary_sequence_start_inst->ldvary_pipelining_start = false;
if (c->ldvary_sequence_length > 1)
c->ldvary_sequence_end_inst->ldvary_pipelining_end = true;
c->ldvary_sequence_length = 0;
c->ldvary_sequence_start_inst = NULL;
c->ldvary_sequence_end_inst = NULL;
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
@@ -928,9 +981,10 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
struct qinst *ldvary = NULL;
struct qreg vary;
if (c->devinfo->ver >= 41) {
struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
c->undef, c->undef);
ldvary->qpu.sig.ldvary = true;
vary = vir_emit_def(c, ldvary);
@@ -955,7 +1009,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
return emit_smooth_varying(c, ldvary, vary, c->payload_w, r5);
}
int i = c->num_inputs++;
@@ -969,19 +1023,22 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
case INTERP_MODE_SMOOTH:
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = vir_FADD(c, vir_FMUL(c, vary,
c->payload_w_centroid), r5);
result = emit_smooth_varying(c, ldvary, vary,
c->payload_w_centroid, r5);
} else {
result = vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
result = emit_smooth_varying(c, ldvary, vary,
c->payload_w, r5);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
break_smooth_varying_sequence(c);
BITSET_SET(c->noperspective_flags, i);
result = vir_FADD(c, vir_MOV(c, vary), r5);
break;
case INTERP_MODE_FLAT:
break_smooth_varying_sequence(c);
BITSET_SET(c->flat_shade_flags, i);
vir_MOV_dest(c, c->undef, vary);
result = vir_MOV(c, r5);
@@ -2019,6 +2076,8 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
}
}
}
break_smooth_varying_sequence(c);
}
static void

View File

@@ -162,6 +162,19 @@ struct qinst {
* otherwise.
*/
int uniform;
/* Set if this instruction participates in a pipelinable sequence of
* smooth varyings.
*/
bool ldvary_pipelining;
/* Set if this is the ldvary instruction starting a pipelinable
* sequence of smooth varyings.
*/
bool ldvary_pipelining_start;
/* Set if this is the fadd instruction ending a pipelinable
* sequence of smooth varyings.
*/
bool ldvary_pipelining_end;
};
enum quniform_contents {
@@ -769,6 +782,11 @@ struct v3d_compile {
uint32_t program_id;
uint32_t variant_id;
/* Used to track pipelinable sequences of smooth varyings */
struct qinst *ldvary_sequence_start_inst;
struct qinst *ldvary_sequence_end_inst;
uint32_t ldvary_sequence_length;
/* Set to compile program in 1x, 2x, or 4x threaded mode, where
* SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
* limiting ourselves to the part of the physical reg space.