broadcom/compiler: track pending ldtmu count with each TMU lookup

Use this information when scheduling QPU instructions to avoid merging
a new TMU request into a previous ldtmu instruction when doing so may
overflow the TMU output fifo due to a stalling ldtmu.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22044>
This commit is contained in:
Iago Toral Quiroga
2023-03-20 11:15:40 +01:00
committed by Marge Bot
parent c09482b293
commit 1e28f2a6f2
5 changed files with 86 additions and 17 deletions

View File

@@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
int offset_src,
struct qreg base_offset,
uint32_t const_offset,
uint32_t dest_components,
uint32_t *tmu_writes)
{
if (mode == MODE_COUNT) {
@@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA);
tmu->ldtmu_count = dest_components;
}
/**
@@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
emit_tmu_general_address_write(c, mode, instr, config,
dynamic_src, offset_src,
base_offset, const_offset,
&tmu_writes);
dest_components, &tmu_writes);
assert(tmu_writes > 0);
if (mode == MODE_COUNT) {

View File

@@ -496,6 +496,8 @@ struct choose_scoreboard {
bool last_thrsw_emitted;
bool fixup_ldvary;
int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
};
static bool
@@ -1207,6 +1209,29 @@ retry:
continue;
}
/* We can emit a new tmu lookup with a previous ldtmu
* if doing this would free just enough space in the
* TMU output fifo so we don't overflow. However, this
* is only safe if the ldtmu cannot stall.
*
* A ldtmu can stall if it is not the first following a
* thread switch and corresponds to the first word of a
* read request.
*
* FIXME: For now we forbid pairing up a new lookup
* with a previous ldtmu that is not the first after a
* thrsw if that could overflow the TMU output fifo
* regardless of whether the ldtmu is reading the first
* word of a TMU result or not, since we don't track
* this aspect in the compiler yet.
*/
if (prev_inst->inst->qpu.sig.ldtmu &&
!scoreboard->first_ldtmu_after_thrsw &&
(scoreboard->pending_ldtmu_count +
n->inst->ldtmu_count > 16 / c->threads)) {
continue;
}
struct v3d_qpu_instr merged_inst;
if (!qpu_merge_inst(c->devinfo, &merged_inst,
&prev_inst->inst->qpu, inst)) {
@@ -1294,11 +1319,32 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
}
}
/* Updates the scoreboard's TMU bookkeeping for an instruction that has just
 * been chosen: the "first ldtmu after thrsw" flag and the count of ldtmu
 * instructions still pending for outstanding TMU lookups.
 */
static void
update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
                               const struct qinst *inst)
{
        /* When the current tick is exactly two past the last thread switch
         * tick, flag that no ldtmu has been seen since that thread switch.
         */
        if (scoreboard->last_thrsw_tick + 2 == scoreboard->tick)
                scoreboard->first_ldtmu_after_thrsw = true;

        /* Account for the ldtmu instructions this instruction will require
         * before handling any ldtmu signal it carries itself.
         */
        scoreboard->pending_ldtmu_count += inst->ldtmu_count;

        if (!inst->qpu.sig.ldtmu)
                return;

        /* This instruction is an ldtmu: it consumes one pending result and
         * any later ldtmu is no longer the first after the thread switch.
         */
        assert(scoreboard->pending_ldtmu_count > 0);
        scoreboard->pending_ldtmu_count--;
        scoreboard->first_ldtmu_after_thrsw = false;
}
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst,
const struct qinst *qinst,
const struct v3d_device_info *devinfo)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return;
@@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
update_scoreboard_tmu_tracking(scoreboard, qinst);
}
static void
@@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
{
list_addtail(&inst->link, &block->instructions);
update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++;
scoreboard->tick++;
}
@@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c,
merge->inst->uniform;
}
chosen->inst->ldtmu_count +=
merge->inst->ldtmu_count;
if (debug) {
fprintf(stderr, "t=%4d: merging: ",
time);
@@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");

View File

@@ -30,25 +30,27 @@
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v41_pack.h"
static inline void
static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
/* XXX perf: We should figure out how to merge ALU operations
* producing the val with this MOV, when possible.
*/
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}
static inline void
static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
enum v3d_qpu_waddr waddr,
struct qreg val,
uint32_t *tmu_writes)
{
if (tmu_writes)
if (tmu_writes) {
(*tmu_writes)++;
else
vir_TMU_WRITE(c, waddr, val);
return NULL;
} else {
return vir_TMU_WRITE(c, waddr, val);
}
}
static void
@@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */
struct qinst *retiring;
if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
} else if (instr->op == nir_texop_txl) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
} else {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
}
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data);
}
@@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
* which is why we always call ntq_get_src() even if we are only interested in
* register write counts.
*/
static void
static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
nir_intrinsic_instr *instr,
bool atomic_add_replaced,
@@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
V3D_QPU_PF_PUSHZ);
}
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
struct qinst *retiring =
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
instr->intrinsic != nir_intrinsic_image_load) {
@@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
(struct qinst *)c->cur_block->instructions.prev;
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
}
return retiring;
}
static unsigned
@@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
struct qinst *retiring =
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data);
}

View File

@@ -173,6 +173,11 @@ struct qinst {
/* If this is a TLB Z write */
bool is_tlb_z_write;
/* If this is a retiring TMU instruction (the last in a lookup sequence),
* how many ldtmu instructions are required to read the results.
*/
uint32_t ldtmu_count;
/* Position of this instruction in the program. Filled in during
* register allocation.
*/

View File

@@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
inst->qpu.flags.ac = cond;
inst->ldtmu_count = 1;
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
0xffffff7f); /* per-quad */