broadcom/compiler: track pending ldtmu count with each TMU lookup

Use the pending ldtmu count when scheduling QPU instructions to avoid
merging a new TMU request into a previous ldtmu instruction when doing
so may overflow the TMU output fifo because the ldtmu stalls.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22044>
This commit is contained in:
Iago Toral Quiroga
2023-03-20 11:15:40 +01:00
committed by Marge Bot
parent c09482b293
commit 1e28f2a6f2
5 changed files with 86 additions and 17 deletions

View File

@@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
int offset_src, int offset_src,
struct qreg base_offset, struct qreg base_offset,
uint32_t const_offset, uint32_t const_offset,
uint32_t dest_components,
uint32_t *tmu_writes) uint32_t *tmu_writes)
{ {
if (mode == MODE_COUNT) { if (mode == MODE_COUNT) {
@@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
if (vir_in_nonuniform_control_flow(c)) if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA); vir_set_cond(tmu, V3D_QPU_COND_IFA);
tmu->ldtmu_count = dest_components;
} }
/** /**
@@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
emit_tmu_general_address_write(c, mode, instr, config, emit_tmu_general_address_write(c, mode, instr, config,
dynamic_src, offset_src, dynamic_src, offset_src,
base_offset, const_offset, base_offset, const_offset,
&tmu_writes); dest_components, &tmu_writes);
assert(tmu_writes > 0); assert(tmu_writes > 0);
if (mode == MODE_COUNT) { if (mode == MODE_COUNT) {

View File

@@ -496,6 +496,8 @@ struct choose_scoreboard {
bool last_thrsw_emitted; bool last_thrsw_emitted;
bool fixup_ldvary; bool fixup_ldvary;
int ldvary_count; int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
}; };
static bool static bool
@@ -1207,6 +1209,29 @@ retry:
continue; continue;
} }
/* We can emit a new tmu lookup with a previous ldtmu
* if doing this would free just enough space in the
* TMU output fifo so we don't overflow. However, this
* is only safe if the ldtmu cannot stall.
*
* A ldtmu can stall if it is not the first following a
* thread switch and corresponds to the first word of a
* read request.
*
* FIXME: For now we forbid pairing up a new lookup
* with a previous ldtmu that is not the first after a
* thrsw if that could overflow the TMU output fifo
* regardless of whether the ldtmu is reading the first
* word of a TMU result or not, since we don't track
* this aspect in the compiler yet.
*/
if (prev_inst->inst->qpu.sig.ldtmu &&
!scoreboard->first_ldtmu_after_thrsw &&
(scoreboard->pending_ldtmu_count +
n->inst->ldtmu_count > 16 / c->threads)) {
continue;
}
struct v3d_qpu_instr merged_inst; struct v3d_qpu_instr merged_inst;
if (!qpu_merge_inst(c->devinfo, &merged_inst, if (!qpu_merge_inst(c->devinfo, &merged_inst,
&prev_inst->inst->qpu, inst)) { &prev_inst->inst->qpu, inst)) {
@@ -1294,11 +1319,32 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
} }
} }
/**
 * Updates the TMU-related scoreboard state for the given instruction.
 *
 * Three pieces of state are maintained:
 *  - first_ldtmu_after_thrsw is raised when the current tick is exactly
 *    two ticks past last_thrsw_tick, and lowered again as soon as an
 *    instruction carrying the ldtmu signal is seen.
 *  - pending_ldtmu_count grows by the instruction's ldtmu_count.
 *  - pending_ldtmu_count shrinks by one for an instruction carrying the
 *    ldtmu signal.
 */
static void
update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
                               const struct qinst *inst)
{
        /* NOTE(review): the +2 offset presumably matches the point at which
         * a thread switch takes effect -- confirm against the thrsw
         * scheduling rules.
         */
        const bool at_thrsw_boundary =
                scoreboard->tick == scoreboard->last_thrsw_tick + 2;
        if (at_thrsw_boundary)
                scoreboard->first_ldtmu_after_thrsw = true;

        /* Account for the ldtmu reads this instruction will require. This
         * happens before the decrement below, so an instruction that both
         * adds pending reads and carries an ldtmu contributes its count
         * first.
         */
        scoreboard->pending_ldtmu_count += inst->ldtmu_count;

        /* Nothing else to do unless this instruction consumes a TMU result */
        if (!inst->qpu.sig.ldtmu)
                return;

        /* An ldtmu retires one pending read and means that any later ldtmu
         * is no longer the first one after the thread switch.
         */
        assert(scoreboard->pending_ldtmu_count > 0);
        scoreboard->pending_ldtmu_count--;
        scoreboard->first_ldtmu_after_thrsw = false;
}
static void static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst, const struct qinst *qinst,
const struct v3d_device_info *devinfo) const struct v3d_device_info *devinfo)
{ {
const struct v3d_qpu_instr *inst = &qinst->qpu;
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return; return;
@@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary) if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick; scoreboard->last_ldvary_tick = scoreboard->tick;
update_scoreboard_tmu_tracking(scoreboard, qinst);
} }
static void static void
@@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
{ {
list_addtail(&inst->link, &block->instructions); list_addtail(&inst->link, &block->instructions);
update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++; c->qpu_inst_count++;
scoreboard->tick++; scoreboard->tick++;
} }
@@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c,
merge->inst->uniform; merge->inst->uniform;
} }
chosen->inst->ldtmu_count +=
merge->inst->ldtmu_count;
if (debug) { if (debug) {
fprintf(stderr, "t=%4d: merging: ", fprintf(stderr, "t=%4d: merging: ",
time); time);
@@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_branch_tick = -10; scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10; scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10; scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
if (debug) { if (debug) {
fprintf(stderr, "Pre-schedule instructions\n"); fprintf(stderr, "Pre-schedule instructions\n");

View File

@@ -30,25 +30,27 @@
#define __gen_emit_reloc(cl, reloc) #define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v41_pack.h" #include "cle/v3d_packet_v41_pack.h"
static inline void static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{ {
/* XXX perf: We should figure out how to merge ALU operations /* XXX perf: We should figure out how to merge ALU operations
* producing the val with this MOV, when possible. * producing the val with this MOV, when possible.
*/ */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
} }
static inline void static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c, vir_TMU_WRITE_or_count(struct v3d_compile *c,
enum v3d_qpu_waddr waddr, enum v3d_qpu_waddr waddr,
struct qreg val, struct qreg val,
uint32_t *tmu_writes) uint32_t *tmu_writes)
{ {
if (tmu_writes) if (tmu_writes) {
(*tmu_writes)++; (*tmu_writes)++;
else return NULL;
vir_TMU_WRITE(c, waddr, val); } else {
return vir_TMU_WRITE(c, waddr, val);
}
} }
static void static void
@@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */ /* Emit retiring TMU write */
struct qinst *retiring;
if (instr->op == nir_texop_txf) { if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
} else if (instr->op == nir_texop_txl) { } else if (instr->op == nir_texop_txl) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
} else { } else {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
} }
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest, ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data); p0_unpacked.return_words_of_texture_data);
} }
@@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
* which is why we always call ntq_get_src() even if we are only interested in * which is why we always call ntq_get_src() even if we are only interested in
* register write counts. * register write counts.
*/ */
static void static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c, vir_image_emit_register_writes(struct v3d_compile *c,
nir_intrinsic_instr *instr, nir_intrinsic_instr *instr,
bool atomic_add_replaced, bool atomic_add_replaced,
@@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
V3D_QPU_PF_PUSHZ); V3D_QPU_PF_PUSHZ);
} }
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); struct qinst *retiring =
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
if (!tmu_writes && vir_in_nonuniform_control_flow(c) && if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
instr->intrinsic != nir_intrinsic_image_load) { instr->intrinsic != nir_intrinsic_image_load) {
@@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
(struct qinst *)c->cur_block->instructions.prev; (struct qinst *)c->cur_block->instructions.prev;
vir_set_cond(last_inst, V3D_QPU_COND_IFA); vir_set_cond(last_inst, V3D_QPU_COND_IFA);
} }
return retiring;
} }
static unsigned static unsigned
@@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); struct qinst *retiring =
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest, ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data); p0_unpacked.return_words_of_texture_data);
} }

View File

@@ -173,6 +173,11 @@ struct qinst {
/* If this is a TLB Z write */ /* If this is a TLB Z write */
bool is_tlb_z_write; bool is_tlb_z_write;
/* If this is a retiring TMU instruction (the last in a lookup sequence),
* how many ldtmu instructions are required to read the results.
*/
uint32_t ldtmu_count;
/* Position of this instruction in the program. Filled in during /* Position of this instruction in the program. Filled in during
* register allocation. * register allocation.
*/ */

View File

@@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
inst->qpu.flags.ac = cond; inst->qpu.flags.ac = cond;
inst->ldtmu_count = 1;
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
0xffffff7f); /* per-quad */ 0xffffff7f); /* per-quad */