v3d: Rematerialize MOVs of uniforms instead of spilling them.
If we have a MOV of a uniform value available to spill, that's one of our best choices. Instead of spilling the value, we can simply emit a new load of the uniform as the fill. This avoids bothering the TMU and emitting a thrsw, and costs the same number of uniforms (since the spill offset is a uniform anyway).

This doesn't have a huge impact on shader-db, since there aren't a whole lot of spills and we usually copy-prop the uniforms at the VIR level, such that the only uniform MOVs are the ones from vir_lower_uniforms:

total instructions in shared programs: 6430292 -> 6430279 (<.01%)
total uniforms in shared programs: 2386023 -> 2385787 (<.01%)
total spills in shared programs: 4961 -> 4960 (-0.02%)
total fills in shared programs: 6352 -> 6350 (-0.03%)

However, I'm interested in dropping the uniforms copy-prop in the backend, since it would be cheaper to not load repeated uniforms if we have the registers to spare.

This also saves many spills on dEQP-GLES31.functional.ubo.random.all_per_block_buffers.20, which is what motivated a bunch of my recent backend work in the first place:

before: 46 spills, 106 fills, 3062 instructions
after: 0 spills, 0 fills, 2611 instructions
This commit is contained in:
@@ -1152,4 +1152,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
|
|||||||
vir_for_each_block(_block, c) \
|
vir_for_each_block(_block, c) \
|
||||||
vir_for_each_inst(inst, _block)
|
vir_for_each_inst(inst, _block)
|
||||||
|
|
||||||
|
#define vir_for_each_inst_inorder_safe(inst, c) \
|
||||||
|
vir_for_each_block(_block, c) \
|
||||||
|
vir_for_each_inst_safe(inst, _block)
|
||||||
|
|
||||||
#endif /* V3D_COMPILER_H */
|
#endif /* V3D_COMPILER_H */
|
||||||
|
@@ -47,10 +47,21 @@ is_last_ldtmu(struct qinst *inst, struct qblock *block)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
vir_is_mov_uniform(struct v3d_compile *c, int temp)
|
||||||
|
{
|
||||||
|
struct qinst *def = c->defs[temp];
|
||||||
|
|
||||||
|
return (def &&
|
||||||
|
vir_is_raw_mov(def) &&
|
||||||
|
def->src[0].file == QFILE_UNIF);
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
|
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
|
||||||
uint32_t *temp_to_node)
|
uint32_t *temp_to_node)
|
||||||
{
|
{
|
||||||
|
const float tmu_scale = 5;
|
||||||
float block_scale = 1.0;
|
float block_scale = 1.0;
|
||||||
float spill_costs[c->num_temps];
|
float spill_costs[c->num_temps];
|
||||||
bool in_tmu_operation = false;
|
bool in_tmu_operation = false;
|
||||||
@@ -75,22 +86,28 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
int temp = inst->src[i].index;
|
int temp = inst->src[i].index;
|
||||||
if (no_spilling) {
|
if (vir_is_mov_uniform(c, temp)) {
|
||||||
BITSET_CLEAR(c->spillable,
|
|
||||||
temp);
|
|
||||||
} else {
|
|
||||||
spill_costs[temp] += block_scale;
|
spill_costs[temp] += block_scale;
|
||||||
|
} else if (!no_spilling) {
|
||||||
|
spill_costs[temp] += (block_scale *
|
||||||
|
tmu_scale);
|
||||||
|
} else {
|
||||||
|
BITSET_CLEAR(c->spillable, temp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inst->dst.file == QFILE_TEMP) {
|
if (inst->dst.file == QFILE_TEMP) {
|
||||||
int temp = inst->dst.index;
|
int temp = inst->dst.index;
|
||||||
|
|
||||||
if (no_spilling) {
|
if (vir_is_mov_uniform(c, temp)) {
|
||||||
BITSET_CLEAR(c->spillable,
|
/* We just rematerialize the uniform
|
||||||
temp);
|
* later.
|
||||||
|
*/
|
||||||
|
} else if (!no_spilling) {
|
||||||
|
spill_costs[temp] += (block_scale *
|
||||||
|
tmu_scale);
|
||||||
} else {
|
} else {
|
||||||
spill_costs[temp] += block_scale;
|
BITSET_CLEAR(c->spillable, temp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,18 +201,28 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
|
|||||||
static void
|
static void
|
||||||
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
|
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
|
||||||
{
|
{
|
||||||
|
bool is_uniform = vir_is_mov_uniform(c, spill_temp);
|
||||||
|
|
||||||
|
uint32_t spill_offset = 0;
|
||||||
|
|
||||||
|
if (!is_uniform) {
|
||||||
uint32_t spill_offset = c->spill_size;
|
uint32_t spill_offset = c->spill_size;
|
||||||
c->spill_size += 16 * sizeof(uint32_t);
|
c->spill_size += 16 * sizeof(uint32_t);
|
||||||
|
|
||||||
if (spill_offset == 0)
|
if (spill_offset == 0)
|
||||||
v3d_setup_spill_base(c);
|
v3d_setup_spill_base(c);
|
||||||
|
}
|
||||||
|
|
||||||
struct qinst *last_thrsw = c->last_thrsw;
|
struct qinst *last_thrsw = c->last_thrsw;
|
||||||
assert(!last_thrsw || last_thrsw->is_last_thrsw);
|
assert(!last_thrsw || last_thrsw->is_last_thrsw);
|
||||||
|
|
||||||
int start_num_temps = c->num_temps;
|
int start_num_temps = c->num_temps;
|
||||||
|
|
||||||
vir_for_each_inst_inorder(inst, c) {
|
struct qreg uniform_src = c->undef;
|
||||||
|
if (is_uniform)
|
||||||
|
uniform_src = c->defs[spill_temp]->src[0];
|
||||||
|
|
||||||
|
vir_for_each_inst_inorder_safe(inst, c) {
|
||||||
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||||
if (inst->src[i].file != QFILE_TEMP ||
|
if (inst->src[i].file != QFILE_TEMP ||
|
||||||
inst->src[i].index != spill_temp) {
|
inst->src[i].index != spill_temp) {
|
||||||
@@ -204,31 +231,41 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
|
|||||||
|
|
||||||
c->cursor = vir_before_inst(inst);
|
c->cursor = vir_before_inst(inst);
|
||||||
|
|
||||||
|
if (is_uniform) {
|
||||||
|
inst->src[i] = vir_MOV(c, uniform_src);
|
||||||
|
} else {
|
||||||
v3d_emit_spill_tmua(c, spill_offset);
|
v3d_emit_spill_tmua(c, spill_offset);
|
||||||
vir_emit_thrsw(c);
|
vir_emit_thrsw(c);
|
||||||
inst->src[i] = vir_LDTMU(c);
|
inst->src[i] = vir_LDTMU(c);
|
||||||
c->fills++;
|
c->fills++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (inst->dst.file == QFILE_TEMP &&
|
if (inst->dst.file == QFILE_TEMP &&
|
||||||
inst->dst.index == spill_temp) {
|
inst->dst.index == spill_temp) {
|
||||||
|
if (is_uniform) {
|
||||||
|
c->cursor.link = NULL;
|
||||||
|
vir_remove_instruction(c, inst);
|
||||||
|
} else {
|
||||||
c->cursor = vir_after_inst(inst);
|
c->cursor = vir_after_inst(inst);
|
||||||
|
|
||||||
inst->dst.index = c->num_temps++;
|
inst->dst.index = c->num_temps++;
|
||||||
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
|
vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
|
||||||
|
V3D_QPU_WADDR_TMUD),
|
||||||
inst->dst);
|
inst->dst);
|
||||||
v3d_emit_spill_tmua(c, spill_offset);
|
v3d_emit_spill_tmua(c, spill_offset);
|
||||||
vir_emit_thrsw(c);
|
vir_emit_thrsw(c);
|
||||||
vir_TMUWT(c);
|
vir_TMUWT(c);
|
||||||
c->spills++;
|
c->spills++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* If we didn't have a last-thrsw inserted by nir_to_vir and
|
/* If we didn't have a last-thrsw inserted by nir_to_vir and
|
||||||
* we've been inserting thrsws, then insert a new last_thrsw
|
* we've been inserting thrsws, then insert a new last_thrsw
|
||||||
* right before we start the vpm/tlb sequence for the last
|
* right before we start the vpm/tlb sequence for the last
|
||||||
* thread segment.
|
* thread segment.
|
||||||
*/
|
*/
|
||||||
if (!last_thrsw && c->last_thrsw &&
|
if (!is_uniform && !last_thrsw && c->last_thrsw &&
|
||||||
(v3d_qpu_writes_vpm(&inst->qpu) ||
|
(v3d_qpu_writes_vpm(&inst->qpu) ||
|
||||||
v3d_qpu_uses_tlb(&inst->qpu))) {
|
v3d_qpu_uses_tlb(&inst->qpu))) {
|
||||||
c->cursor = vir_before_inst(inst);
|
c->cursor = vir_before_inst(inst);
|
||||||
|
Reference in New Issue
Block a user