broadcom/compiler: emit instructions producing flags earlier

We usually emit flags right before consuming them, but this is
suboptimal from the point of view of register pressure: if an
instruction is only used to generate flags, then waiting to emit
it right before the flags are read extends the liveness of the
sources used to generate the flags for no gain. This pass
checks for such instructions and tries to move them as early as
possible.

Shader-db results below show this is effective to reduce register
pressure, allowing a few shaders to increase thread counts and/or
reduce spilling:

total instructions in shared programs: 11057173 -> 11057076 (<.01%)
instructions in affected programs: 1955543 -> 1955446 (<.01%)
helped: 4214
HURT: 3905
Inconclusive result (value mean confidence interval includes 0).

total threads in shared programs: 425096 -> 425170 (0.02%)
threads in affected programs: 74 -> 148 (100.00%)
helped: 37
HURT: 0
Threads are helped.

total uniforms in shared programs: 3846275 -> 3845674 (-0.02%)
uniforms in affected programs: 23574 -> 22973 (-2.55%)
helped: 217
HURT: 30
Uniforms are helped.

total max-temps in shared programs: 2222910 -> 2220488 (-0.11%)
max-temps in affected programs: 61904 -> 59482 (-3.91%)
helped: 2145
HURT: 113
Max-temps are helped.

total spills in shared programs: 4294 -> 4280 (-0.33%)
spills in affected programs: 148 -> 134 (-9.46%)
helped: 8
HURT: 0

total fills in shared programs: 6497 -> 6468 (-0.45%)
fills in affected programs: 291 -> 262 (-9.97%)
helped: 8
HURT: 0

total sfu-stalls in shared programs: 14344 -> 14611 (1.86%)
sfu-stalls in affected programs: 1308 -> 1575 (20.41%)
helped: 217
HURT: 335
Inconclusive result (%-change mean confidence interval includes 0).

total inst-and-stalls in shared programs: 11071517 -> 11071687 (<.01%)
inst-and-stalls in affected programs: 1946767 -> 1946937 (<.01%)
helped: 4191
HURT: 3909
Inconclusive result (value mean confidence interval includes 0).

total nops in shared programs: 270628 -> 269829 (-0.30%)
nops in affected programs: 22032 -> 21233 (-3.63%)
helped: 1213
HURT: 571
Inconclusive result (%-change mean confidence interval includes 0).

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30511>
This commit is contained in:
Iago Toral Quiroga
2024-08-01 11:22:37 +02:00
parent d9849ac466
commit 086ed1e54b

View File

@@ -4621,6 +4621,71 @@ ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
ntq_emit_cf_list(c, &impl->body);
}
/**
 * Returns true when at least one source operand of the instruction names
 * the same register as 'r', i.e. both the register file and the index
 * match. Returns false otherwise, including when the instruction has no
 * sources.
 */
static bool
vir_inst_reads_reg(struct qinst *inst, struct qreg r)
{
        bool found = false;

        /* Stop scanning as soon as a matching source has been seen. */
        for (int s = 0; !found && s < vir_get_nsrc(inst); s++) {
                found = inst->src[s].file == r.file &&
                        inst->src[s].index == r.index;
        }

        return found;
}
/**
 * Hoists flags-only instructions within a single block.
 *
 * The block is traversed with the reverse list iterator. Whenever an ALU
 * instruction that writes flags and has a NULL destination is found, it
 * becomes the pending candidate. The candidate is then carried past each
 * following instruction of the traversal until one of them acts as a
 * barrier, at which point it is re-linked next to that barrier. If no
 * barrier shows up before the traversal ends, the candidate is re-linked
 * at the block's instruction list head.
 */
static void
sched_flags_in_block(struct v3d_compile *c, struct qblock *block)
{
        /* Flags-only instruction currently being hoisted, if any. */
        struct qinst *pending = NULL;

        list_for_each_entry_safe_rev(struct qinst, inst, &block->instructions, link) {
                if (pending) {
                        /* The pending instruction cannot go any earlier
                         * than 'inst' when:
                         *
                         * - it consumes the value 'inst' produces, or
                         * - 'inst' itself reads or writes flags.
                         */
                        const bool is_barrier =
                                vir_inst_reads_reg(pending, inst->dst) ||
                                v3d_qpu_writes_flags(&inst->qpu) ||
                                v3d_qpu_reads_flags(&inst->qpu);

                        if (is_barrier) {
                                list_move_to(&pending->link, &inst->link);
                                pending = NULL;
                        }
                }

                /* Only ALU instructions whose sole effect is writing flags
                 * (no destination register) are candidates for hoisting.
                 */
                const bool only_writes_flags =
                        inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                        inst->dst.file == QFILE_NULL &&
                        v3d_qpu_writes_flags(&inst->qpu);

                if (only_writes_flags) {
                        /* A new candidate writes flags, so any previous
                         * candidate must already have been placed by the
                         * barrier check above.
                         */
                        assert(!pending);
                        pending = inst;
                }
        }

        /* Nothing stopped the candidate before the start of the block was
         * reached: place it at the top of the block.
         */
        if (pending)
                list_move_to(&pending->link, &block->instructions);
}
/**
* The purpose of this pass is to emit instructions that are only concerned
* with producing flags as early as possible to hopefully reduce liveness
* of their source arguments.
*/
static void
sched_flags(struct v3d_compile *c)
{
vir_for_each_block(block, c)
sched_flags_in_block(c, block);
}
static void
nir_to_vir(struct v3d_compile *c)
{
@@ -4894,6 +4959,7 @@ v3d_nir_to_vir(struct v3d_compile *c)
}
vir_optimize(c);
sched_flags(c);
vir_check_payload_w(c);