broadcom/compiler: let QPUs stall on TMU input/config overflows
We have been trying to avoid this by tracking fifo usage in the driver
and flushing all outstanding TMU sequences whenever we would overflow
any of these fifos; however, that is not the most efficient strategy.
Instead, we would like to flush only enough operations to get things
going again, which is better for pipelining. Doing that in the driver
would require some additional work, but thankfully it is not necessary:
this seems to be what the hardware does automatically, so we can simply
remove overflow tracking for these two fifos and enjoy the benefits.
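To make the capacity math concrete: the Output fifo has 16 entries shared
across all threads, so a 2-way threaded shader gets 8 per-thread entries
and a 4-way threaded shader gets 4. The only overflow check that now
remains in the driver is the one below (a sketch reproduced from the first
hunk of this patch, shown here for reference):

    bool
    ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
    {
            /* The driver-side flush queue is fixed-size, so flush when full. */
            if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                    return true;

            /* Input/Config fifo pressure is absorbed by QPU stalls; only the
             * Output fifo (16 entries shared across c->threads threads) still
             * forces an early flush.
             */
            return components > 0 &&
                   c->tmu.output_fifo_size + components > 16 / c->threads;
    }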
This also further improves shader-db stats:
total instructions in shared programs: 8975062 -> 8955145 (-0.22%)
instructions in affected programs: 1637624 -> 1617707 (-1.22%)
helped: 4050
HURT: 2241
Instructions are helped.
total threads in shared programs: 236802 -> 237042 (0.10%)
threads in affected programs: 252 -> 492 (95.24%)
helped: 122
HURT: 2
Threads are helped.
total sfu-stalls in shared programs: 19901 -> 19592 (-1.55%)
sfu-stalls in affected programs: 4744 -> 4435 (-6.51%)
helped: 1248
HURT: 1051
Sfu-stalls are helped.
total inst-and-stalls in shared programs: 8994963 -> 8974737 (-0.22%)
inst-and-stalls in affected programs: 1636184 -> 1615958 (-1.24%)
helped: 4050
HURT: 2239
Inst-and-stalls are helped.
Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8825>
---
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -203,25 +203,23 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
 }
 
 /**
- * Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
- * 'writes' TMU register writes would overflow any of the TMU fifos.
+ * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
+ * would overflow the Output TMU fifo.
+ *
+ * It is not allowed to overflow the Output fifo; however, we can overflow
+ * the Input and Config fifos. Doing that makes the shader stall, but only
+ * for as long as it needs to be able to continue, so it is better for
+ * pipelining to let the QPU stall on these if needed than to try to emit
+ * TMU flushes in the driver.
  */
 bool
-ntq_tmu_fifo_overflow(struct v3d_compile *c,
-                      uint32_t components,
-                      uint32_t writes)
+ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
 {
-        if (c->tmu.input_fifo_size + writes > 16 / c->threads)
+        if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                 return true;
 
-        /* Output and Config fifos are only involved with TMU lookups */
-        if (components > 0 &&
-            (c->tmu.config_fifo_size + 1 > 8 / c->threads ||
-             c->tmu.output_fifo_size + components > 16 / c->threads)) {
-                return true;
-        }
-
-        return false;
+        return components > 0 &&
+               c->tmu.output_fifo_size + components > 16 / c->threads;
 }
 
 /**
@@ -254,8 +252,6 @@ ntq_flush_tmu(struct v3d_compile *c)
                 }
         }
 
-        c->tmu.input_fifo_size = 0;
-        c->tmu.config_fifo_size = 0;
         c->tmu.output_fifo_size = 0;
         c->tmu.flush_count = 0;
         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
@@ -269,15 +265,12 @@ ntq_flush_tmu(struct v3d_compile *c)
 void
 ntq_add_pending_tmu_flush(struct v3d_compile *c,
                           nir_dest *dest,
-                          uint32_t component_mask,
-                          uint32_t tmu_writes)
+                          uint32_t component_mask)
 {
         const uint32_t num_components = util_bitcount(component_mask);
-        assert(!ntq_tmu_fifo_overflow(c, num_components, tmu_writes));
+        assert(!ntq_tmu_fifo_overflow(c, num_components));
 
-        c->tmu.input_fifo_size += tmu_writes;
         if (num_components > 0) {
-                c->tmu.config_fifo_size += 1;
                 c->tmu.output_fifo_size += num_components;
                 if (!dest->is_ssa)
                         _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
@@ -544,14 +537,14 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         }
 
         /* We are ready to emit TMU register writes now, but before we actually
-         * emit them we need to know the amount of writes we will require
-         * and we need to flush outstanding TMU operations if any of the writes
-         * reads from the result of an outstanding TMU operation before we emit
-         * any of the writes for the current operation to avoid corrupting its
-         * TMU sequence. To do this we run this logic twice, the first time
-         * it will count register writes and flush pending TMU requests if
-         * necessary due to a dependency, and the second one will emit the
-         * actual TMU writes.
+         * emit them we need to flush outstanding TMU operations if any of our
+         * writes reads from the result of an outstanding TMU operation before
+         * we start the TMU sequence for this operation, since otherwise the
+         * flush could happen in the middle of the TMU sequence we are about to
+         * emit, which is illegal. To do this we run this logic twice, the
+         * first time it will count required register writes and flush pending
+         * TMU requests if necessary due to a dependency, and the second one
+         * will emit the actual TMU writes.
          */
         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
         uint32_t base_const_offset = const_offset;
@@ -623,7 +616,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         /* If pipelining this TMU operation would
                          * overflow TMU fifos, we need to flush.
                          */
-                        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+                        if (ntq_tmu_fifo_overflow(c, dest_components))
                                 ntq_flush_tmu(c);
                 } else {
                         /* Delay emission of the thread switch and
@@ -633,8 +626,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         const uint32_t component_mask =
                                 (1 << dest_components) - 1;
                         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                  component_mask,
-                                                  tmu_writes);
+                                                  component_mask);
                         }
                 }
         } while (is_store && writemask != 0);
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -262,7 +262,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
          */
         const unsigned dest_components =
                 util_bitcount(p0_unpacked.return_words_of_texture_data);
-        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+        if (ntq_tmu_fifo_overflow(c, dest_components))
                 ntq_flush_tmu(c);
 
         /* Process tex sources emitting corresponding TMU writes */
@@ -380,8 +380,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         }
 
         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                  p0_unpacked.return_words_of_texture_data,
-                                  tmu_writes);
+                                  p0_unpacked.return_words_of_texture_data);
 }
 
 static uint32_t
@@ -591,7 +590,7 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
         /* If pipelining this TMU operation would overflow TMU fifos, we need
          * to flush any outstanding TMU operations.
          */
-        if (ntq_tmu_fifo_overflow(c, instr_return_channels, tmu_writes))
+        if (ntq_tmu_fifo_overflow(c, instr_return_channels))
                 ntq_flush_tmu(c);
 
         vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed);
@@ -603,6 +602,5 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
         vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
 
         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                  p0_unpacked.return_words_of_texture_data,
-                                  tmu_writes);
+                                  p0_unpacked.return_words_of_texture_data);
 }
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -42,6 +42,25 @@
 #include "qpu/qpu_instr.h"
 #include "pipe/p_state.h"
 
+/**
+ * Maximum number of outstanding TMU operations we can queue for execution.
+ *
+ * This is mostly limited by the size of the TMU fifos. The Input and Config
+ * fifos can stall, and we prefer that to injecting TMU flushes manually in
+ * the driver, so we can ignore those; however, we cannot overflow the Output
+ * fifo, which has 16 / threads per-thread entries. This means the maximum
+ * number of outstanding LDTMUs we can ever have is 8, for a 2-way threaded
+ * shader, so at most we can queue 8 outstanding TMU loads if each load
+ * reads just one component.
+ *
+ * NOTE: we could actually use a larger value here because TMU stores don't
+ * consume any entries in the Output fifo (so we could have any number of
+ * outstanding stores) and the driver keeps track of used Output fifo entries
+ * and will flush if we ever need more than 8, but since loads are much more
+ * common than stores, it is probably not worth it.
+ */
+#define MAX_TMU_QUEUE_SIZE 8
+
 struct nir_builder;
 
 struct v3d_fs_inputs {
@@ -573,15 +592,13 @@ struct v3d_compile {
                 */
                struct set *outstanding_regs;
 
-               uint32_t input_fifo_size;
-               uint32_t config_fifo_size;
                uint32_t output_fifo_size;
 
                struct {
                        nir_dest *dest;
                        uint8_t num_components;
                        uint8_t component_mask;
-               } flush[8]; /* 16 entries / 2 threads for input/output fifos */
+               } flush[MAX_TMU_QUEUE_SIZE];
                uint32_t flush_count;
         } tmu;
 
@@ -943,9 +960,9 @@ uint8_t vir_channels_written(struct qinst *inst);
 struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
 void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                     struct qreg result);
-bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes);
+bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
 void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
-                               uint32_t component_mask, uint32_t tmu_writes);
+                               uint32_t component_mask);
 void ntq_flush_tmu(struct v3d_compile *c);
 void vir_emit_thrsw(struct v3d_compile *c);
 
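Taken together, the call sites in this patch all follow the same pipelining
pattern; a condensed sketch (surrounding control flow and emission details
elided, names as in the hunks above):

    /* Flush only if queueing this operation would overflow the Output fifo
     * or the driver's flush queue; Input/Config overflows now simply stall
     * the QPU until entries free up.
     */
    if (ntq_tmu_fifo_overflow(c, dest_components))
            ntq_flush_tmu(c);

    /* ... emit the TMU register writes for this operation ... */

    /* Queue the operation so its LDTMUs are emitted at the next flush. */
    ntq_add_pending_tmu_flush(c, &instr->dest, component_mask);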