aco: improve p_create_vector RA for sub-dword operands

There are still improvements needed for sub-dword definitions, but that's
not as simple.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4507>
This commit is contained in:
Rhys Perry
2020-04-10 11:38:20 +01:00
committed by Marge Bot
parent e18711cda3
commit 52cc1f8237

View File

@@ -481,11 +481,12 @@ bool get_regs_for_copies(ra_ctx& ctx,
std::pair<PhysReg, bool> res;
if (is_dead_operand) {
if (instr->opcode == aco_opcode::p_create_vector) {
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
for (unsigned j = 0; j < size; j++)
assert(reg_file[def_reg_lo + offset + j] == 0);
res = {PhysReg{def_reg_lo + offset}, true};
PhysReg reg(def_reg_lo);
reg.reg_b += offset;
assert(!reg_file.test(reg, var.rc.bytes()));
res = {reg, true};
break;
}
}
@@ -893,6 +894,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
{
/* create_vector instructions have different costs w.r.t. register coalescing */
uint32_t size = rc.size();
uint32_t bytes = rc.bytes();
uint32_t stride = 1;
uint32_t lb, ub;
if (rc.type() == RegType::vgpr) {
@@ -907,20 +909,25 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
stride = 4;
}
//TODO: improve p_create_vector for sub-dword vectors
unsigned best_pos = -1;
unsigned num_moves = 0xFF;
bool best_war_hint = true;
/* test for each operand which definition placement causes the least shuffle instructions */
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
// TODO: think about, if we can alias live operands on the same register
if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
continue;
if (offset > instr->operands[i].physReg())
if (offset > instr->operands[i].physReg().reg_b)
continue;
unsigned reg_lo = instr->operands[i].physReg() - offset;
unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
if (reg_lo % 4)
continue;
reg_lo /= 4;
unsigned reg_hi = reg_lo + size - 1;
unsigned k = 0;
@@ -942,10 +949,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
bool linear_vgpr = false;
for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
if (reg_file[j] != 0) {
k++;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].rc & (1 << 6))
linear_vgpr = true;
if (reg_file[j] == 0xF0000000) {
PhysReg reg;
reg.reg_b = j * 4;
unsigned bytes_left = bytes - (j - reg_lo) * 4;
for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
k += reg_file.test(reg, 1);
} else {
k += 4;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].rc & (1 << 6))
linear_vgpr = true;
}
}
war_hint |= ctx.war_hint[j];
}
@@ -953,13 +968,13 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
continue;
/* count operands in wrong positions */
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
if (j == i ||
!instr->operands[j].isTemp() ||
instr->operands[j].getTemp().type() != rc.type())
continue;
if (instr->operands[j].physReg() != reg_lo + offset)
k += instr->operands[j].size();
if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
k += instr->operands[j].bytes();
}
bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves))
@@ -970,18 +985,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
best_war_hint = war_hint;
}
if (num_moves >= size)
if (num_moves >= bytes)
return get_reg(ctx, reg_file, rc, parallelcopies, instr);
/* collect variables to be moved */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
/* move killed operands which aren't yet at the correct position */
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
if (instr->operands[i].isTemp() &&
instr->operands[i].isFirstKillBeforeDef() &&
instr->operands[i].getTemp().type() == rc.type() &&
instr->operands[i].physReg() != best_pos + offset)
instr->operands[i].physReg().reg_b != best_pos * 4 + offset)
vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
}