broadcom/compiler: track pending ldtmu count with each TMU lookup
And use this information when scheduling QPU to avoid merging a new TMU
request into a previous ldtmu instruction when doing so may cause TMU
output fifo overflow due to a stalling ldtmu.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22044>

Committed by: Marge Bot
Parent: c09482b293
Commit: 1e28f2a6f2
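
In rough terms, the scheduler change boils down to the following check. This is an
illustrative sketch, not the Mesa code itself: the helper name and its flattened
parameter list are invented for this note, while the real logic lives inline in the
QPU scheduler's instruction-merge loop and reads its state from struct
choose_scoreboard and struct qinst (see the hunks that follow).

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the pairing rule. The per-thread share of the TMU output fifo
 * is 16 / threads words; a new lookup may only be merged with a previous
 * ldtmu that could stall if its result words still fit in that share.
 */
static bool
may_pair_lookup_with_prev_ldtmu(bool prev_is_ldtmu,
                                bool first_ldtmu_after_thrsw,
                                uint32_t pending_ldtmu_count,
                                uint32_t new_lookup_ldtmu_count,
                                uint32_t threads)
{
        /* This check only restricts pairing with a previous ldtmu. */
        if (!prev_is_ldtmu)
                return true;

        /* The first ldtmu after a thread switch cannot stall, so pairing
         * with it is always allowed.
         */
        if (first_ldtmu_after_thrsw)
                return true;

        /* Otherwise the ldtmu may stall: only pair if the words already
         * pending plus the words the new lookup returns fit in the
         * per-thread share of the TMU output fifo.
         */
        return pending_ldtmu_count + new_lookup_ldtmu_count <= 16 / threads;
}

For example, a 4-thread shader gets a 16 / 4 = 4 word share, so a lookup returning
4 words can only be merged with a stallable ldtmu while no other result words are
pending.
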
@@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
                                int offset_src,
                                struct qreg base_offset,
                                uint32_t const_offset,
+                               uint32_t dest_components,
                                uint32_t *tmu_writes)
 {
         if (mode == MODE_COUNT) {

@@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
 
         if (vir_in_nonuniform_control_flow(c))
                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+        tmu->ldtmu_count = dest_components;
 }
 
 /**

@@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         emit_tmu_general_address_write(c, mode, instr, config,
                                        dynamic_src, offset_src,
                                        base_offset, const_offset,
-                                       &tmu_writes);
+                                       dest_components, &tmu_writes);
 
         assert(tmu_writes > 0);
         if (mode == MODE_COUNT) {

@@ -496,6 +496,8 @@ struct choose_scoreboard {
         bool last_thrsw_emitted;
         bool fixup_ldvary;
         int ldvary_count;
+        int pending_ldtmu_count;
+        bool first_ldtmu_after_thrsw;
 };
 
 static bool

@@ -1207,6 +1209,29 @@ retry:
                         continue;
                 }
 
+                /* We can emit a new tmu lookup with a previous ldtmu
+                 * if doing this would free just enough space in the
+                 * TMU output fifo so we don't overflow, however, this
+                 * is only safe if the ldtmu cannot stall.
+                 *
+                 * A ldtmu can stall if it is not the first following a
+                 * thread switch and corresponds to the first word of a
+                 * read request.
+                 *
+                 * FIXME: For now we forbid pairing up a new lookup
+                 * with a previous ldtmu that is not the first after a
+                 * thrsw if that could overflow the TMU output fifo
+                 * regardless of whether the ldtmu is reading the first
+                 * word of a TMU result or not, since we don't track
+                 * this aspect in the compiler yet.
+                 */
+                if (prev_inst->inst->qpu.sig.ldtmu &&
+                    !scoreboard->first_ldtmu_after_thrsw &&
+                    (scoreboard->pending_ldtmu_count +
+                     n->inst->ldtmu_count > 16 / c->threads)) {
+                        continue;
+                }
+
                 struct v3d_qpu_instr merged_inst;
                 if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                     &prev_inst->inst->qpu, inst)) {

@@ -1294,11 +1319,32 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
         }
 }
 
+static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+                               const struct qinst *inst)
+{
+        /* Track if the have seen any ldtmu after the last thread switch */
+        if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+                scoreboard->first_ldtmu_after_thrsw = true;
+
+        /* Track the number of pending ldtmu instructions for outstanding
+         * TMU lookups.
+         */
+        scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+        if (inst->qpu.sig.ldtmu) {
+                assert(scoreboard->pending_ldtmu_count > 0);
+                scoreboard->pending_ldtmu_count--;
+                scoreboard->first_ldtmu_after_thrsw = false;
+        }
+}
+
 static void
 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
-                             const struct v3d_qpu_instr *inst,
+                             const struct qinst *qinst,
                              const struct v3d_device_info *devinfo)
 {
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                 return;
 

@@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 
         if (inst->sig.ldvary)
                 scoreboard->last_ldvary_tick = scoreboard->tick;
+
+        update_scoreboard_tmu_tracking(scoreboard, qinst);
 }
 
 static void

@@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
 {
         list_addtail(&inst->link, &block->instructions);
 
-        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+        update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
         c->qpu_inst_count++;
         scoreboard->tick++;
 }

@@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c,
                                         merge->inst->uniform;
                         }
 
+                        chosen->inst->ldtmu_count +=
+                                merge->inst->ldtmu_count;
+
                         if (debug) {
                                 fprintf(stderr, "t=%4d: merging: ",
                                         time);

@@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         scoreboard.last_branch_tick = -10;
         scoreboard.last_setmsf_tick = -10;
         scoreboard.last_stallable_sfu_tick = -10;
+        scoreboard.first_ldtmu_after_thrsw = true;
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");

@@ -30,25 +30,27 @@
 #define __gen_emit_reloc(cl, reloc)
 #include "cle/v3d_packet_v41_pack.h"
 
-static inline void
+static inline struct qinst *
 vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
 {
         /* XXX perf: We should figure out how to merge ALU operations
          * producing the val with this MOV, when possible.
          */
-        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+        return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
 }
 
-static inline void
+static inline struct qinst *
 vir_TMU_WRITE_or_count(struct v3d_compile *c,
                        enum v3d_qpu_waddr waddr,
                        struct qreg val,
                        uint32_t *tmu_writes)
 {
-        if (tmu_writes)
+        if (tmu_writes) {
                 (*tmu_writes)++;
-        else
-                vir_TMU_WRITE(c, waddr, val);
+                return NULL;
+        } else {
+                return vir_TMU_WRITE(c, waddr, val);
+        }
 }
 
 static void

@@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
         /* Emit retiring TMU write */
+        struct qinst *retiring;
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
         } else if (instr->op == nir_texop_txl) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
         }
 
+        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
         ntq_add_pending_tmu_flush(c, &instr->dest,
                                   p0_unpacked.return_words_of_texture_data);
 }

@@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
  * which is why we always call ntq_get_src() even if we are only interested in
  * register write counts.
  */
-static void
+static struct qinst *
 vir_image_emit_register_writes(struct v3d_compile *c,
                                nir_intrinsic_instr *instr,
                                bool atomic_add_replaced,

@@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
                              V3D_QPU_PF_PUSHZ);
         }
 
-        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
+        struct qinst *retiring =
+                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
 
         if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
             instr->intrinsic != nir_intrinsic_image_load) {

@@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
                         (struct qinst *)c->cur_block->instructions.prev;
                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
         }
+
+        return retiring;
 }
 
 static unsigned

@@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
         if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
-        vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
-
+        struct qinst *retiring =
+                vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
+        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
         ntq_add_pending_tmu_flush(c, &instr->dest,
                                   p0_unpacked.return_words_of_texture_data);
 }

@@ -173,6 +173,11 @@ struct qinst {
         /* If this is a a TLB Z write */
         bool is_tlb_z_write;
 
+        /* If this is a retiring TMU instruction (the last in a lookup sequence),
+         * how many ldtmu instructions are required to read the results.
+         */
+        uint32_t ldtmu_count;
+
         /* Position of this instruction in the program. Filled in during
          * register allocation.
          */

@@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
         struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
         struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
         inst->qpu.flags.ac = cond;
+        inst->ldtmu_count = 1;
         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                               0xffffff7f); /* per-quad */
 