broadcom/compiler: convert add to mul when possible to allow merge

Integer add/sub can be implemented as either an add or a mul instruction,
but we always emit them as add instructions at the VIR level. We can use
this flexibility to improve our QPU scheduling: when we are attempting to
merge one of these with another add instruction, we can convert it to a
mul instruction so that the merge becomes possible.

total instructions in shared programs: 13721549 -> 13691004 (-0.22%)
instructions in affected programs: 3340493 -> 3309948 (-0.91%)
helped: 12805
HURT: 1656
Instructions are helped.

total max-temps in shared programs: 2319528 -> 2319317 (<.01%)
max-temps in affected programs: 5285 -> 5074 (-3.99%)
helped: 195
HURT: 3
Max-temps are helped.

total sfu-stalls in shared programs: 31616 -> 31752 (0.43%)
sfu-stalls in affected programs: 469 -> 605 (29.00%)
helped: 52
HURT: 161
Sfu-stalls are HURT.

total inst-and-stalls in shared programs: 13753165 -> 13722756 (-0.22%)
inst-and-stalls in affected programs: 3340383 -> 3309974 (-0.91%)
helped: 12782
HURT: 1666
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9769>
This commit is contained in:
Iago Toral Quiroga
2021-03-18 13:03:01 +01:00
committed by Marge Bot
parent bdf93f4e3b
commit 22a979be65

View File

@@ -820,6 +820,50 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
return true;
}
/**
 * Returns true when the given add-ALU opcode is one that this file knows how
 * to re-express as a mul-ALU opcode (currently only ADD and SUB).
 */
static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        return op == V3D_QPU_A_ADD || op == V3D_QPU_A_SUB;
}
/**
 * Translates an add-ALU opcode into its mul-ALU counterpart.
 *
 * Only V3D_QPU_A_ADD and V3D_QPU_A_SUB have a mapping; any other opcode
 * reaches unreachable().
 */
static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        if (op == V3D_QPU_A_ADD)
                return V3D_QPU_M_ADD;

        if (op == V3D_QPU_A_SUB)
                return V3D_QPU_M_SUB;

        unreachable("unexpected add opcode");
}
static void
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
assert(inst->alu.mul.op == V3D_QPU_M_NOP);
memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
inst->alu.add.op = V3D_QPU_A_NOP;
inst->flags.mc = inst->flags.ac;
inst->flags.mpf = inst->flags.apf;
inst->flags.muf = inst->flags.auf;
inst->flags.ac = V3D_QPU_PF_NONE;
inst->flags.apf = V3D_QPU_PF_NONE;
inst->flags.auf = V3D_QPU_PF_NONE;
}
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
struct v3d_qpu_instr *result,
@@ -837,9 +881,9 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
struct v3d_qpu_instr merge = *a;
const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
struct v3d_qpu_instr mul_inst;
if (b->alu.add.op != V3D_QPU_A_NOP) {
if (a->alu.add.op != V3D_QPU_A_NOP)
return false;
if (a->alu.add.op == V3D_QPU_A_NOP) {
merge.alu.add = b->alu.add;
merge.flags.ac = b->flags.ac;
@@ -849,6 +893,41 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
add_instr = b;
mul_instr = a;
}
/* If a's add op is used but its mul op is not, then see if we
* can convert either a's add op or b's add op to a mul op
* so we can merge.
*/
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
qpu_convert_add_to_mul(&mul_inst);
merge.alu.mul = mul_inst.alu.mul;
merge.flags.mc = b->flags.ac;
merge.flags.mpf = b->flags.apf;
merge.flags.muf = b->flags.auf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
qpu_convert_add_to_mul(&mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
merge.flags.ac = b->flags.ac;
merge.flags.apf = b->flags.apf;
merge.flags.auf = b->flags.auf;
add_instr = b;
mul_instr = &mul_inst;
} else {
return false;
}
}
if (b->alu.mul.op != V3D_QPU_M_NOP) {
if (a->alu.mul.op != V3D_QPU_M_NOP)