broadcom/compiler: skip jumps in non-uniform if/then when block cost is small

We have an optimization for non-uniform if/else where if all channels meet the
jump condition we emit a branch to jump straight to the ELSE block. Similarly,
if at the end of the THEN block we don't have any channels that would execute
the ELSE block, we emit a branch to jump straight to the AFTER block.

This optimization has a cost though: we need to emit the condition for the
branch and a branch instruction (which also comes with 3 delay slots), so for
very small blocks (just a couple of ALU for example) emitting the branch
instruction is typically worse. Further, if the condition for the branch is not
met, we still pay the cost for no benefit at all.

Here is an example:

nop                           ; fmul.ifa rf26, 0x3e800000, rf54
xor.pushz -, rf52, 2          ; nop
bu.alla  32, r:unif (0x00000000 / 0.000000)
nop                           ; nop
nop                           ; nop
nop                           ; nop
xor.pushz -, rf52, 3          ; nop
nop                           ; mov.ifa rf52, 0
nop                           ; mov.pushz -, rf52
nop                           ; mov.ifa rf26, 0x3f800000

The bu instruction here is set up to jump over the following 4 instructions
(the last 4 instructions in there). To do this, we pay the price of the xor
to generate the condition, the bu instruction, and the 3 delay slots right
after it, so we end up paying 6 instructions to skip over 4. We always pay
that cost, even if the branch is not taken and we still have to execute those
4 instructions. With this change, we produce:

nop                           ; fmul.ifa rf56, 0x3e800000, rf28
xor.pushz -, rf9, 3           ; nop
nop                           ; mov.ifa rf9, 0
nop                           ; mov.pushz -, rf9
nop                           ; mov.ifa rf56, 0x3f800000

Now we don't try to skip the small block, ever. At worst, if all channels
would have met the branch condition, we only pay the cost of the 4
instructions instead of 6. At best, if any channel wouldn't take the
branch, we save ourselves 5 cycles for the branch condition, the branch
instruction and its 3 delay slots.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23161>
This commit is contained in:
Iago Toral Quiroga
2023-05-16 11:34:51 +02:00
committed by Marge Bot
parent 4c8be22c66
commit e401add741

View File

@@ -3819,6 +3819,25 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
}
/**
 * Reports whether a NIR block is small and simple enough to be considered
 * "cheap".
 *
 * A block is cheap when it holds at most two instructions and every one of
 * them is an ALU, ssa_undef or load_const instruction. Any other instruction
 * type, or a third instruction, makes the block not cheap.
 *
 * Returns true for a cheap block (including an empty one), false otherwise.
 */
static bool
is_cheap_block(nir_block *block)
{
        /* Largest number of simple instructions a cheap block may hold. */
        const int32_t max_cheap_instrs = 2;
        int32_t num_instrs = 0;

        nir_foreach_instr(instr, block) {
                const bool is_simple =
                        instr->type == nir_instr_type_alu ||
                        instr->type == nir_instr_type_ssa_undef ||
                        instr->type == nir_instr_type_load_const;

                /* Anything other than the simple types above disqualifies
                 * the block right away.
                 */
                if (!is_simple)
                        return false;

                /* Stop as soon as the block exceeds the size limit. */
                num_instrs++;
                if (num_instrs > max_cheap_instrs)
                        return false;
        }

        return true;
}
static void
ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
{
@@ -3963,12 +3982,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
c->execute,
vir_uniform_ui(c, else_block->index));
/* Jump to ELSE if nothing is active for THEN, otherwise fall
* through.
/* Jump to ELSE if nothing is active for THEN (unless THEN block is
* so small it won't pay off), otherwise fall through.
*/
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
vir_link_blocks(c->cur_block, else_block);
bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
is_cheap_block(nir_if_first_then_block(if_stmt));
if (!is_cheap) {
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
vir_link_blocks(c->cur_block, else_block);
}
vir_link_blocks(c->cur_block, then_block);
/* Process the THEN block. */
@@ -3985,13 +4008,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
/* If everything points at ENDIF, then jump there immediately. */
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
c->execute,
vir_uniform_ui(c, after_block->index)),
V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
vir_link_blocks(c->cur_block, after_block);
/* If everything points at ENDIF, then jump there immediately
* (unless ELSE block is so small it won't pay off).
*/
bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
is_cheap_block(nir_else_block);
if (!is_cheap) {
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
c->execute,
vir_uniform_ui(c, after_block->index)),
V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
vir_link_blocks(c->cur_block, after_block);
}
vir_link_blocks(c->cur_block, else_block);
vir_set_emit_block(c, else_block);