broadcom/compiler: try to use ldunif(a) instead of ldunif(a)rf in v71
The rf variants need to encode the destination in the cond bits, which prevents them from being merged with any other instruction that needs those bits. In 4.x, ldunif(a) writes to r5, which is a special register that only ldunif(a) and ldvary can write, so we have a special register class for it and only allow it for them. Then, when we need to choose a register for a node, if this register is available we always use it. In 7.x these instructions write to rf0, which can be used by any instruction, so instead of restricting rf0, we track the temps that are used as ldunif(a) destinations and use that information to favor rf0 for them. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25450>
This commit is contained in:

committed by
Marge Bot

parent
d8a25bdb07
commit
3a36a618d7
@@ -605,6 +605,9 @@ struct v3d_ra_node_info {
|
|||||||
struct {
|
struct {
|
||||||
uint32_t priority;
|
uint32_t priority;
|
||||||
uint8_t class_bits;
|
uint8_t class_bits;
|
||||||
|
|
||||||
|
/* V3D 7.x */
|
||||||
|
bool is_ldunif_dst;
|
||||||
} *info;
|
} *info;
|
||||||
uint32_t alloc_count;
|
uint32_t alloc_count;
|
||||||
};
|
};
|
||||||
|
@@ -384,6 +384,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
|
|||||||
/* We fill the node priority after we are done inserting spills */
|
/* We fill the node priority after we are done inserting spills */
|
||||||
c->nodes.info[node].class_bits = class_bits;
|
c->nodes.info[node].class_bits = class_bits;
|
||||||
c->nodes.info[node].priority = 0;
|
c->nodes.info[node].priority = 0;
|
||||||
|
c->nodes.info[node].is_ldunif_dst = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The spill offset for this thread takes a bit of setup, so do it once at
|
/* The spill offset for this thread takes a bit of setup, so do it once at
|
||||||
@@ -899,9 +900,22 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
|
|||||||
|
|
||||||
static bool
|
static bool
|
||||||
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||||
|
unsigned int node,
|
||||||
BITSET_WORD *regs,
|
BITSET_WORD *regs,
|
||||||
unsigned int *out)
|
unsigned int *out)
|
||||||
{
|
{
|
||||||
|
/* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
|
||||||
|
* so we can avoid turning them into ldunifrf (which uses the
|
||||||
|
* cond field to encode the dst and would prevent merge with
|
||||||
|
* instructions that use cond flags).
|
||||||
|
*/
|
||||||
|
if (v3d_ra->nodes->info[node].is_ldunif_dst &&
|
||||||
|
BITSET_TEST(regs, v3d_ra->phys_index)) {
|
||||||
|
assert(v3d_ra->devinfo->ver >= 71);
|
||||||
|
*out = v3d_ra->phys_index;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||||
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
||||||
int phys = v3d_ra->phys_index + phys_off;
|
int phys = v3d_ra->phys_index + phys_off;
|
||||||
@@ -927,7 +941,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
|
|||||||
return reg;
|
return reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (v3d_ra_select_rf(v3d_ra, regs, &reg))
|
if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
|
||||||
return reg;
|
return reg;
|
||||||
|
|
||||||
/* If we ran out of physical registers try to assign an accumulator
|
/* If we ran out of physical registers try to assign an accumulator
|
||||||
@@ -1139,15 +1153,24 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* If the instruction has an implicit write
|
/* Make sure we don't allocate the ldvary's
|
||||||
* we can't allocate its dest to the same
|
* destination to rf0, since it would clash
|
||||||
* register.
|
* with its implicit write to that register.
|
||||||
*/
|
*/
|
||||||
if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
|
if (inst->qpu.sig.ldvary) {
|
||||||
ra_add_node_interference(c->g,
|
ra_add_node_interference(c->g,
|
||||||
temp_to_node(c, inst->dst.index),
|
temp_to_node(c, inst->dst.index),
|
||||||
implicit_rf_nodes[0]);
|
implicit_rf_nodes[0]);
|
||||||
}
|
}
|
||||||
|
/* Flag dst temps from ldunif(a) instructions
|
||||||
|
* so we can try to assign rf0 to them and avoid
|
||||||
|
* converting these to ldunif(a)rf.
|
||||||
|
*/
|
||||||
|
if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
|
||||||
|
const uint32_t dst_n =
|
||||||
|
temp_to_node(c, inst->dst.index);
|
||||||
|
c->nodes.info[dst_n].is_ldunif_dst = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1222,6 +1245,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
|||||||
* without accumulators that can have implicit writes to phys regs.
|
* without accumulators that can have implicit writes to phys regs.
|
||||||
*/
|
*/
|
||||||
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||||
|
c->nodes.info[i].is_ldunif_dst = false;
|
||||||
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||||
acc_nodes[i] = i;
|
acc_nodes[i] = i;
|
||||||
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
||||||
|
@@ -345,8 +345,15 @@ v3d_generate_code_block(struct v3d_compile *c,
|
|||||||
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
||||||
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
||||||
|
|
||||||
if (!dst.magic ||
|
bool use_rf;
|
||||||
dst.index != V3D_QPU_WADDR_R5) {
|
if (c->devinfo->has_accumulators) {
|
||||||
|
use_rf = !dst.magic ||
|
||||||
|
dst.index != V3D_QPU_WADDR_R5;
|
||||||
|
} else {
|
||||||
|
use_rf = dst.magic || dst.index != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (use_rf) {
|
||||||
assert(c->devinfo->ver >= 40);
|
assert(c->devinfo->ver >= 40);
|
||||||
|
|
||||||
if (qinst->qpu.sig.ldunif) {
|
if (qinst->qpu.sig.ldunif) {
|
||||||
|
Reference in New Issue
Block a user