broadcom/compiler: track pending ldtmu count with each TMU lookup
And use this information when scheduling QPU to avoid merging a new TMU request into a previous ldtmu instruction when doing so may cause TMU output fifo overflow due to a stalling ldtmu. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22044>
This commit is contained in:

committed by
Marge Bot

parent
c09482b293
commit
1e28f2a6f2
@@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
|
||||
int offset_src,
|
||||
struct qreg base_offset,
|
||||
uint32_t const_offset,
|
||||
uint32_t dest_components,
|
||||
uint32_t *tmu_writes)
|
||||
{
|
||||
if (mode == MODE_COUNT) {
|
||||
@@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
|
||||
|
||||
if (vir_in_nonuniform_control_flow(c))
|
||||
vir_set_cond(tmu, V3D_QPU_COND_IFA);
|
||||
|
||||
tmu->ldtmu_count = dest_components;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
|
||||
emit_tmu_general_address_write(c, mode, instr, config,
|
||||
dynamic_src, offset_src,
|
||||
base_offset, const_offset,
|
||||
&tmu_writes);
|
||||
dest_components, &tmu_writes);
|
||||
|
||||
assert(tmu_writes > 0);
|
||||
if (mode == MODE_COUNT) {
|
||||
|
@@ -496,6 +496,8 @@ struct choose_scoreboard {
|
||||
bool last_thrsw_emitted;
|
||||
bool fixup_ldvary;
|
||||
int ldvary_count;
|
||||
int pending_ldtmu_count;
|
||||
bool first_ldtmu_after_thrsw;
|
||||
};
|
||||
|
||||
static bool
|
||||
@@ -1207,6 +1209,29 @@ retry:
|
||||
continue;
|
||||
}
|
||||
|
||||
/* We can emit a new tmu lookup with a previous ldtmu
|
||||
* if doing this would free just enough space in the
|
||||
* TMU output fifo so we don't overflow, however, this
|
||||
* is only safe if the ldtmu cannot stall.
|
||||
*
|
||||
* A ldtmu can stall if it is not the first following a
|
||||
* thread switch and corresponds to the first word of a
|
||||
* read request.
|
||||
*
|
||||
* FIXME: For now we forbid pairing up a new lookup
|
||||
* with a previous ldtmu that is not the first after a
|
||||
* thrsw if that could overflow the TMU output fifo
|
||||
* regardless of whether the ldtmu is reading the first
|
||||
* word of a TMU result or not, since we don't track
|
||||
* this aspect in the compiler yet.
|
||||
*/
|
||||
if (prev_inst->inst->qpu.sig.ldtmu &&
|
||||
!scoreboard->first_ldtmu_after_thrsw &&
|
||||
(scoreboard->pending_ldtmu_count +
|
||||
n->inst->ldtmu_count > 16 / c->threads)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
struct v3d_qpu_instr merged_inst;
|
||||
if (!qpu_merge_inst(c->devinfo, &merged_inst,
|
||||
&prev_inst->inst->qpu, inst)) {
|
||||
@@ -1294,11 +1319,32 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
|
||||
const struct qinst *inst)
|
||||
{
|
||||
/* Track if we have seen any ldtmu after the last thread switch */
|
||||
if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
|
||||
scoreboard->first_ldtmu_after_thrsw = true;
|
||||
|
||||
/* Track the number of pending ldtmu instructions for outstanding
|
||||
* TMU lookups.
|
||||
*/
|
||||
scoreboard->pending_ldtmu_count += inst->ldtmu_count;
|
||||
if (inst->qpu.sig.ldtmu) {
|
||||
assert(scoreboard->pending_ldtmu_count > 0);
|
||||
scoreboard->pending_ldtmu_count--;
|
||||
scoreboard->first_ldtmu_after_thrsw = false;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
|
||||
const struct v3d_qpu_instr *inst,
|
||||
const struct qinst *qinst,
|
||||
const struct v3d_device_info *devinfo)
|
||||
{
|
||||
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
||||
|
||||
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
|
||||
return;
|
||||
|
||||
@@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
|
||||
|
||||
if (inst->sig.ldvary)
|
||||
scoreboard->last_ldvary_tick = scoreboard->tick;
|
||||
|
||||
update_scoreboard_tmu_tracking(scoreboard, qinst);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
|
||||
{
|
||||
list_addtail(&inst->link, &block->instructions);
|
||||
|
||||
update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
|
||||
update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
|
||||
c->qpu_inst_count++;
|
||||
scoreboard->tick++;
|
||||
}
|
||||
@@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c,
|
||||
merge->inst->uniform;
|
||||
}
|
||||
|
||||
chosen->inst->ldtmu_count +=
|
||||
merge->inst->ldtmu_count;
|
||||
|
||||
if (debug) {
|
||||
fprintf(stderr, "t=%4d: merging: ",
|
||||
time);
|
||||
@@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
|
||||
scoreboard.last_branch_tick = -10;
|
||||
scoreboard.last_setmsf_tick = -10;
|
||||
scoreboard.last_stallable_sfu_tick = -10;
|
||||
scoreboard.first_ldtmu_after_thrsw = true;
|
||||
|
||||
if (debug) {
|
||||
fprintf(stderr, "Pre-schedule instructions\n");
|
||||
|
@@ -30,25 +30,27 @@
|
||||
#define __gen_emit_reloc(cl, reloc)
|
||||
#include "cle/v3d_packet_v41_pack.h"
|
||||
|
||||
static inline void
|
||||
static inline struct qinst *
|
||||
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
|
||||
{
|
||||
/* XXX perf: We should figure out how to merge ALU operations
|
||||
* producing the val with this MOV, when possible.
|
||||
*/
|
||||
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
|
||||
return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
|
||||
}
|
||||
|
||||
static inline void
|
||||
static inline struct qinst *
|
||||
vir_TMU_WRITE_or_count(struct v3d_compile *c,
|
||||
enum v3d_qpu_waddr waddr,
|
||||
struct qreg val,
|
||||
uint32_t *tmu_writes)
|
||||
{
|
||||
if (tmu_writes)
|
||||
if (tmu_writes) {
|
||||
(*tmu_writes)++;
|
||||
else
|
||||
vir_TMU_WRITE(c, waddr, val);
|
||||
return NULL;
|
||||
} else {
|
||||
return vir_TMU_WRITE(c, waddr, val);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
|
||||
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
|
||||
|
||||
/* Emit retiring TMU write */
|
||||
struct qinst *retiring;
|
||||
if (instr->op == nir_texop_txf) {
|
||||
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
|
||||
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
|
||||
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
|
||||
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
|
||||
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
|
||||
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
|
||||
} else if (instr->op == nir_texop_txl) {
|
||||
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
|
||||
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
|
||||
} else {
|
||||
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
|
||||
retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
|
||||
}
|
||||
|
||||
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
|
||||
ntq_add_pending_tmu_flush(c, &instr->dest,
|
||||
p0_unpacked.return_words_of_texture_data);
|
||||
}
|
||||
@@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
|
||||
* which is why we always call ntq_get_src() even if we are only interested in
|
||||
* register write counts.
|
||||
*/
|
||||
static void
|
||||
static struct qinst *
|
||||
vir_image_emit_register_writes(struct v3d_compile *c,
|
||||
nir_intrinsic_instr *instr,
|
||||
bool atomic_add_replaced,
|
||||
@@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
|
||||
V3D_QPU_PF_PUSHZ);
|
||||
}
|
||||
|
||||
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
|
||||
struct qinst *retiring =
|
||||
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
|
||||
|
||||
if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
|
||||
instr->intrinsic != nir_intrinsic_image_load) {
|
||||
@@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
|
||||
(struct qinst *)c->cur_block->instructions.prev;
|
||||
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
|
||||
}
|
||||
|
||||
return retiring;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
@@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
|
||||
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
|
||||
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
|
||||
|
||||
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
|
||||
|
||||
struct qinst *retiring =
|
||||
vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
|
||||
retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
|
||||
ntq_add_pending_tmu_flush(c, &instr->dest,
|
||||
p0_unpacked.return_words_of_texture_data);
|
||||
}
|
||||
|
@@ -173,6 +173,11 @@ struct qinst {
|
||||
/* If this is a TLB Z write */
|
||||
bool is_tlb_z_write;
|
||||
|
||||
/* If this is a retiring TMU instruction (the last in a lookup sequence),
|
||||
* how many ldtmu instructions are required to read the results.
|
||||
*/
|
||||
uint32_t ldtmu_count;
|
||||
|
||||
/* Position of this instruction in the program. Filled in during
|
||||
* register allocation.
|
||||
*/
|
||||
|
@@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
|
||||
struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
|
||||
struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
|
||||
inst->qpu.flags.ac = cond;
|
||||
inst->ldtmu_count = 1;
|
||||
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
|
||||
0xffffff7f); /* per-quad */
|
||||
|
||||
|
Reference in New Issue
Block a user