v3d: Implement a small immediates optimization, based on VC4's.
We can do one small immediate per instruction, and we have to be careful not to overwrite raddr_b, but this greatly reduces the pressure on uniform loads (particularly around ldvpm/stvpm instructions).

total instructions in shared programs: 90768 -> 88220 (-2.81%)
instructions in affected programs: 82711 -> 80163 (-3.08%)
This commit is contained in:
@@ -28,6 +28,7 @@ BROADCOM_FILES = \
|
|||||||
compiler/vir_lower_uniforms.c \
|
compiler/vir_lower_uniforms.c \
|
||||||
compiler/vir_opt_copy_propagate.c \
|
compiler/vir_opt_copy_propagate.c \
|
||||||
compiler/vir_opt_dead_code.c \
|
compiler/vir_opt_dead_code.c \
|
||||||
|
compiler/vir_opt_small_immediates.c \
|
||||||
compiler/vir_register_allocate.c \
|
compiler/vir_register_allocate.c \
|
||||||
compiler/vir_to_qpu.c \
|
compiler/vir_to_qpu.c \
|
||||||
compiler/qpu_schedule.c \
|
compiler/qpu_schedule.c \
|
||||||
|
@@ -26,6 +26,7 @@ libbroadcom_compiler_files = files(
|
|||||||
'vir_lower_uniforms.c',
|
'vir_lower_uniforms.c',
|
||||||
'vir_opt_copy_propagate.c',
|
'vir_opt_copy_propagate.c',
|
||||||
'vir_opt_dead_code.c',
|
'vir_opt_dead_code.c',
|
||||||
|
'vir_opt_small_immediates.c',
|
||||||
'vir_register_allocate.c',
|
'vir_register_allocate.c',
|
||||||
'vir_to_qpu.c',
|
'vir_to_qpu.c',
|
||||||
'qpu_schedule.c',
|
'qpu_schedule.c',
|
||||||
|
@@ -670,7 +670,8 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
|||||||
|
|
||||||
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
|
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
|
||||||
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
|
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
|
||||||
a->raddr_b != b->raddr_b) {
|
(a->raddr_b != b->raddr_b ||
|
||||||
|
a->sig.small_imm != b->sig.small_imm)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
merge.raddr_b = b->raddr_b;
|
merge.raddr_b = b->raddr_b;
|
||||||
|
@@ -115,6 +115,7 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index)
|
|||||||
*/
|
*/
|
||||||
struct qpu_reg {
|
struct qpu_reg {
|
||||||
bool magic;
|
bool magic;
|
||||||
|
bool smimm;
|
||||||
int index;
|
int index;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -1001,6 +1001,7 @@ vir_optimize(struct v3d_compile *c)
|
|||||||
|
|
||||||
OPTPASS(vir_opt_copy_propagate);
|
OPTPASS(vir_opt_copy_propagate);
|
||||||
OPTPASS(vir_opt_dead_code);
|
OPTPASS(vir_opt_dead_code);
|
||||||
|
OPTPASS(vir_opt_small_immediates);
|
||||||
|
|
||||||
if (!progress)
|
if (!progress)
|
||||||
break;
|
break;
|
||||||
|
@@ -25,7 +25,8 @@
|
|||||||
#include "v3d_compiler.h"
|
#include "v3d_compiler.h"
|
||||||
|
|
||||||
static void
|
static void
|
||||||
vir_print_reg(struct v3d_compile *c, struct qreg reg)
|
vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
|
||||||
|
struct qreg reg)
|
||||||
{
|
{
|
||||||
static const char *files[] = {
|
static const char *files[] = {
|
||||||
[QFILE_TEMP] = "t",
|
[QFILE_TEMP] = "t",
|
||||||
@@ -58,12 +59,20 @@ vir_print_reg(struct v3d_compile *c, struct qreg reg)
|
|||||||
fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
|
fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case QFILE_SMALL_IMM:
|
case QFILE_SMALL_IMM: {
|
||||||
if ((int)reg.index >= -16 && (int)reg.index <= 15)
|
uint32_t unpacked;
|
||||||
fprintf(stderr, "%d", reg.index);
|
bool ok = v3d_qpu_small_imm_unpack(c->devinfo,
|
||||||
|
inst->qpu.raddr_b,
|
||||||
|
&unpacked);
|
||||||
|
assert(ok); (void) ok;
|
||||||
|
|
||||||
|
if ((int)inst->qpu.raddr_b >= -16 &&
|
||||||
|
(int)inst->qpu.raddr_b <= 15)
|
||||||
|
fprintf(stderr, "%d", unpacked);
|
||||||
else
|
else
|
||||||
fprintf(stderr, "%f", uif(reg.index));
|
fprintf(stderr, "%f", uif(unpacked));
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case QFILE_VPM:
|
case QFILE_VPM:
|
||||||
fprintf(stderr, "vpm%d.%d",
|
fprintf(stderr, "vpm%d.%d",
|
||||||
@@ -220,7 +229,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
|
|||||||
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
|
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
|
||||||
fprintf(stderr, " ");
|
fprintf(stderr, " ");
|
||||||
|
|
||||||
vir_print_reg(c, inst->dst);
|
vir_print_reg(c, inst, inst->dst);
|
||||||
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
|
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
|
||||||
|
|
||||||
unpack[0] = instr->alu.add.a_unpack;
|
unpack[0] = instr->alu.add.a_unpack;
|
||||||
@@ -232,7 +241,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
|
|||||||
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
|
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
|
||||||
fprintf(stderr, " ");
|
fprintf(stderr, " ");
|
||||||
|
|
||||||
vir_print_reg(c, inst->dst);
|
vir_print_reg(c, inst, inst->dst);
|
||||||
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
|
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
|
||||||
|
|
||||||
unpack[0] = instr->alu.mul.a_unpack;
|
unpack[0] = instr->alu.mul.a_unpack;
|
||||||
@@ -241,7 +250,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
|
|||||||
|
|
||||||
for (int i = 0; i < sideband_nsrc; i++) {
|
for (int i = 0; i < sideband_nsrc; i++) {
|
||||||
fprintf(stderr, ", ");
|
fprintf(stderr, ", ");
|
||||||
vir_print_reg(c, inst->src[i]);
|
vir_print_reg(c, inst, inst->src[i]);
|
||||||
if (i < nsrc)
|
if (i < nsrc)
|
||||||
fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
|
fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
|
||||||
}
|
}
|
||||||
@@ -307,7 +316,7 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
|
|||||||
|
|
||||||
if (vir_has_implicit_uniform(inst)) {
|
if (vir_has_implicit_uniform(inst)) {
|
||||||
fprintf(stderr, " ");
|
fprintf(stderr, " ");
|
||||||
vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]);
|
vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
112
src/broadcom/compiler/vir_opt_small_immediates.c
Normal file
112
src/broadcom/compiler/vir_opt_small_immediates.c
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
/*
|
||||||
|
* Copyright © 2014 Broadcom
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file vir_opt_small_immediates.c
|
||||||
|
*
|
||||||
|
* Turns references to small constant uniform values into small immediates
|
||||||
|
* fields.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "v3d_compiler.h"
|
||||||
|
|
||||||
|
static bool debug;
|
||||||
|
|
||||||
|
bool
|
||||||
|
vir_opt_small_immediates(struct v3d_compile *c)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
vir_for_each_inst_inorder(inst, c) {
|
||||||
|
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* The small immediate value sits in the raddr B field, so we
|
||||||
|
* can't have 2 small immediates in one instruction (unless
|
||||||
|
* they're the same value, but that should be optimized away
|
||||||
|
* elsewhere).
|
||||||
|
*/
|
||||||
|
bool uses_small_imm = false;
|
||||||
|
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||||
|
if (inst->src[i].file == QFILE_SMALL_IMM)
|
||||||
|
uses_small_imm = true;
|
||||||
|
}
|
||||||
|
if (uses_small_imm)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||||
|
struct qreg src = vir_follow_movs(c, inst->src[i]);
|
||||||
|
|
||||||
|
if (src.file != QFILE_UNIF ||
|
||||||
|
c->uniform_contents[src.index] !=
|
||||||
|
QUNIFORM_CONSTANT) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vir_has_implicit_uniform(inst) &&
|
||||||
|
i == vir_get_implicit_uniform_src(inst)) {
|
||||||
|
/* No turning the implicit uniform read into
|
||||||
|
* an immediate.
|
||||||
|
*/
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check if the uniform is suitable as a small
|
||||||
|
* immediate.
|
||||||
|
*/
|
||||||
|
uint32_t imm = c->uniform_data[src.index];
|
||||||
|
uint32_t packed;
|
||||||
|
if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check that we don't have any other signals already
|
||||||
|
* that would be incompatible with small_imm.
|
||||||
|
*/
|
||||||
|
struct v3d_qpu_sig new_sig = inst->qpu.sig;
|
||||||
|
uint32_t sig_packed;
|
||||||
|
new_sig.small_imm = true;
|
||||||
|
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
fprintf(stderr, "opt_small_immediate() from: ");
|
||||||
|
vir_dump_inst(c, inst);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
inst->qpu.sig.small_imm = true;
|
||||||
|
inst->qpu.raddr_b = packed;
|
||||||
|
|
||||||
|
inst->src[i].file = QFILE_SMALL_IMM;
|
||||||
|
inst->src[i].index = imm;
|
||||||
|
if (debug) {
|
||||||
|
fprintf(stderr, "to: ");
|
||||||
|
vir_dump_inst(c, inst);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
progress = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
@@ -109,6 +109,12 @@ new_ldunif_instr(struct qinst *inst, int i)
|
|||||||
static void
|
static void
|
||||||
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||||
{
|
{
|
||||||
|
if (src.smimm) {
|
||||||
|
assert(instr->sig.small_imm);
|
||||||
|
*mux = V3D_QPU_MUX_B;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (src.magic) {
|
if (src.magic) {
|
||||||
assert(src.index >= V3D_QPU_WADDR_R0 &&
|
assert(src.index >= V3D_QPU_WADDR_R0 &&
|
||||||
src.index <= V3D_QPU_WADDR_R5);
|
src.index <= V3D_QPU_WADDR_R5);
|
||||||
@@ -244,15 +250,7 @@ v3d_generate_code_block(struct v3d_compile *c,
|
|||||||
src[i] = qpu_acc(5);
|
src[i] = qpu_acc(5);
|
||||||
break;
|
break;
|
||||||
case QFILE_SMALL_IMM:
|
case QFILE_SMALL_IMM:
|
||||||
abort(); /* XXX */
|
src[i].smimm = true;
|
||||||
#if 0
|
|
||||||
src[i].mux = QPU_MUX_SMALL_IMM;
|
|
||||||
src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
|
|
||||||
/* This should only have returned a valid
|
|
||||||
* small immediate field, not ~0 for failure.
|
|
||||||
*/
|
|
||||||
assert(src[i].addr <= 47);
|
|
||||||
#endif
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case QFILE_VPM:
|
case QFILE_VPM:
|
||||||
|
Reference in New Issue
Block a user