third_party_mesa3d/src/broadcom/compiler/vir.c
Eric Anholt 2780a99ff8 v3d: Move the stores for fixed function VS output reads into NIR.
This lets us emit the VPM_WRITEs directly from
nir_intrinsic_store_output() (useful once NIR scheduling is in place, so
that we can reduce register pressure), and lets a future NIR scheduler
place the math that generates them.  Even in the meantime, it looks like
this lets NIR DCE some more code and make better decisions.

total instructions in shared programs: 6429246 -> 6412976 (-0.25%)
total threads in shared programs: 153924 -> 153934 (<.01%)
total loops in shared programs: 486 -> 483 (-0.62%)
total uniforms in shared programs: 2385436 -> 2388195 (0.12%)

Acked-by: Ian Romanick <ian.d.romanick@intel.com> (nir)
2019-03-05 10:59:40 -08:00

/*
* Copyright © 2016-2017 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
int
vir_get_non_sideband_nsrc(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return 0;
case V3D_QPU_INSTR_TYPE_ALU:
if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
else
return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
}
return 0;
}
int
vir_get_nsrc(struct qinst *inst)
{
int nsrc = vir_get_non_sideband_nsrc(inst);
if (vir_has_implicit_uniform(inst))
nsrc++;
return nsrc;
}
bool
vir_has_implicit_uniform(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return true;
case V3D_QPU_INSTR_TYPE_ALU:
switch (inst->dst.file) {
case QFILE_TLBU:
return true;
case QFILE_MAGIC:
switch (inst->dst.index) {
case V3D_QPU_WADDR_TLBU:
case V3D_QPU_WADDR_TMUAU:
case V3D_QPU_WADDR_SYNCU:
return true;
default:
break;
}
break;
default:
return inst->has_implicit_uniform;
}
}
return false;
}
/* The sideband uniform for textures gets stored after the normal ALU
* arguments.
*/
int
vir_get_implicit_uniform_src(struct qinst *inst)
{
if (!vir_has_implicit_uniform(inst))
return -1;
return vir_get_nsrc(inst) - 1;
}
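/* Illustrative: for a single-source ALU op that writes TLBU,
 * vir_get_nsrc() returns 2, so the sideband uniform lives in src[1].
 */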
/**
* Returns whether the instruction has any side effects that must be
* preserved.
*/
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return true;
case V3D_QPU_INSTR_TYPE_ALU:
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_SETREVF:
case V3D_QPU_A_SETMSF:
case V3D_QPU_A_VPMSETUP:
case V3D_QPU_A_STVPMV:
case V3D_QPU_A_STVPMD:
case V3D_QPU_A_STVPMP:
case V3D_QPU_A_VPMWT:
case V3D_QPU_A_TMUWT:
return true;
default:
break;
}
switch (inst->qpu.alu.mul.op) {
case V3D_QPU_M_MULTOP:
return true;
default:
break;
}
}
if (inst->qpu.sig.ldtmu ||
inst->qpu.sig.ldvary ||
inst->qpu.sig.wrtmuc ||
inst->qpu.sig.thrsw) {
return true;
}
return false;
}
bool
vir_is_raw_mov(struct qinst *inst)
{
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
(inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
return false;
}
if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
return false;
}
if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE)
return false;
return true;
}
bool
vir_is_add(struct qinst *inst)
{
return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}
bool
vir_is_mul(struct qinst *inst)
{
return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}
bool
vir_is_tex(struct qinst *inst)
{
if (inst->dst.file == QFILE_MAGIC)
return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
return true;
}
return false;
}
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VPM:
return true;
default:
break;
}
}
if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
inst->qpu.sig.ldtlb ||
inst->qpu.sig.ldtlbu ||
inst->qpu.sig.ldvpm)) {
return true;
}
return false;
}
bool
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
case V3D_QPU_WADDR_RECIP:
case V3D_QPU_WADDR_RSQRT:
case V3D_QPU_WADDR_EXP:
case V3D_QPU_WADDR_LOG:
case V3D_QPU_WADDR_SIN:
return true;
}
break;
default:
break;
}
if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
return true;
return false;
}
void
vir_set_unpack(struct qinst *inst, int src,
enum v3d_qpu_input_unpack unpack)
{
assert(src == 0 || src == 1);
if (vir_is_add(inst)) {
if (src == 0)
inst->qpu.alu.add.a_unpack = unpack;
else
inst->qpu.alu.add.b_unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
inst->qpu.alu.mul.a_unpack = unpack;
else
inst->qpu.alu.mul.b_unpack = unpack;
}
}
void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
if (vir_is_add(inst)) {
inst->qpu.flags.ac = cond;
} else {
assert(vir_is_mul(inst));
inst->qpu.flags.mc = cond;
}
}
void
vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
{
if (vir_is_add(inst)) {
inst->qpu.flags.apf = pf;
} else {
assert(vir_is_mul(inst));
inst->qpu.flags.mpf = pf;
}
}
void
vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf)
{
if (vir_is_add(inst)) {
inst->qpu.flags.auf = uf;
} else {
assert(vir_is_mul(inst));
inst->qpu.flags.muf = uf;
}
}
#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
if (vir_is_mul(inst)) {
switch (inst->dst.pack) {
case QPU_PACK_MUL_NOP:
case QPU_PACK_MUL_8888:
return 0xf;
case QPU_PACK_MUL_8A:
return 0x1;
case QPU_PACK_MUL_8B:
return 0x2;
case QPU_PACK_MUL_8C:
return 0x4;
case QPU_PACK_MUL_8D:
return 0x8;
}
} else {
switch (inst->dst.pack) {
case QPU_PACK_A_NOP:
case QPU_PACK_A_8888:
case QPU_PACK_A_8888_SAT:
case QPU_PACK_A_32_SAT:
return 0xf;
case QPU_PACK_A_8A:
case QPU_PACK_A_8A_SAT:
return 0x1;
case QPU_PACK_A_8B:
case QPU_PACK_A_8B_SAT:
return 0x2;
case QPU_PACK_A_8C:
case QPU_PACK_A_8C_SAT:
return 0x4;
case QPU_PACK_A_8D:
case QPU_PACK_A_8D_SAT:
return 0x8;
case QPU_PACK_A_16A:
case QPU_PACK_A_16A_SAT:
return 0x3;
case QPU_PACK_A_16B:
case QPU_PACK_A_16B_SAT:
return 0xc;
}
}
unreachable("Bad pack field");
}
#endif
struct qreg
vir_get_temp(struct v3d_compile *c)
{
struct qreg reg;
reg.file = QFILE_TEMP;
reg.index = c->num_temps++;
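/* Grow the def and spillable tracking arrays to match, doubling so
 * that repeated allocation stays amortized; fresh temps start out
 * marked spillable.
 */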
if (c->num_temps > c->defs_array_size) {
uint32_t old_size = c->defs_array_size;
c->defs_array_size = MAX2(old_size * 2, 16);
c->defs = reralloc(c, c->defs, struct qinst *,
c->defs_array_size);
memset(&c->defs[old_size], 0,
sizeof(c->defs[0]) * (c->defs_array_size - old_size));
c->spillable = reralloc(c, c->spillable,
BITSET_WORD,
BITSET_WORDS(c->defs_array_size));
for (int i = old_size; i < c->defs_array_size; i++)
BITSET_SET(c->spillable, i);
}
return reg;
}
struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.alu.add.op = op;
inst->dst = dst;
inst->src[0] = src0;
inst->src[1] = src1;
inst->uniform = ~0;
return inst;
}
struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.alu.mul.op = op;
inst->dst = dst;
inst->src[0] = src0;
inst->src[1] = src1;
inst->uniform = ~0;
return inst;
}
struct qinst *
vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
inst->qpu.branch.cond = cond;
inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
inst->qpu.branch.ub = true;
inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
inst->dst = vir_nop_reg();
inst->src[0] = src;
inst->uniform = ~0;
return inst;
}
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
switch (c->cursor.mode) {
case vir_cursor_add:
list_add(&inst->link, c->cursor.link);
break;
case vir_cursor_addtail:
list_addtail(&inst->link, c->cursor.link);
break;
}
c->cursor = vir_after_inst(inst);
c->live_intervals_valid = false;
}
/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
assert(inst->dst.file == QFILE_NULL);
/* If we're emitting an instruction that's a def, it had better be
* writing a register.
*/
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
}
inst->dst = vir_get_temp(c);
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = inst;
vir_emit(c, inst);
return inst->dst;
}
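/* Typical usage (illustrative; this is essentially the pattern the
 * vir_ADD()-style helper macros in v3d_compiler.h expand to):
 *
 *     struct qreg sum = vir_emit_def(c, vir_add_inst(V3D_QPU_A_ADD,
 *                                                    c->undef, a, b));
 *
 * c->undef is a QFILE_NULL reg, which satisfies the assert above.
 */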
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = NULL;
vir_emit(c, inst);
return inst;
}
struct qblock *
vir_new_block(struct v3d_compile *c)
{
struct qblock *block = rzalloc(c, struct qblock);
list_inithead(&block->instructions);
block->predecessors = _mesa_set_create(block,
_mesa_hash_pointer,
_mesa_key_pointer_equal);
block->index = c->next_block_index++;
return block;
}
void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
c->cur_block = block;
c->cursor = vir_after_block(block);
list_addtail(&block->link, &c->blocks);
}
struct qblock *
vir_entry_block(struct v3d_compile *c)
{
return list_first_entry(&c->blocks, struct qblock, link);
}
struct qblock *
vir_exit_block(struct v3d_compile *c)
{
return list_last_entry(&c->blocks, struct qblock, link);
}
void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
_mesa_set_add(successor->predecessors, predecessor);
if (predecessor->successors[0]) {
assert(!predecessor->successors[1]);
predecessor->successors[1] = successor;
} else {
predecessor->successors[0] = successor;
}
}
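/* A block ends in at most one branch, so it can have at most two
 * successors: the branch target and the fall-through block.
 */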
const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
if (!compiler)
return NULL;
compiler->devinfo = devinfo;
if (!vir_init_reg_sets(compiler)) {
ralloc_free(compiler);
return NULL;
}
return compiler;
}
void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
ralloc_free((void *)compiler);
}
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
struct v3d_key *key,
nir_shader *s,
void (*debug_output)(const char *msg,
void *debug_output_data),
void *debug_output_data,
int program_id, int variant_id)
{
struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
c->compiler = compiler;
c->devinfo = compiler->devinfo;
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
c->threads = 4;
c->debug_output = debug_output;
c->debug_output_data = debug_output_data;
s = nir_shader_clone(c, s);
c->s = s;
list_inithead(&c->blocks);
vir_set_emit_block(c, vir_new_block(c));
c->output_position_index = -1;
c->output_sample_mask_index = -1;
c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
_mesa_key_pointer_equal);
return c;
}
static int
type_size_vec4(const struct glsl_type *type)
{
return glsl_count_attribute_slots(type, false);
}
static void
v3d_lower_nir(struct v3d_compile *c)
{
struct nir_lower_tex_options tex_options = {
.lower_txd = true,
.lower_tg4_broadcom_swizzle = true,
.lower_rect = false, /* XXX: Use this on V3D 3.x */
.lower_txp = ~0,
/* Apply swizzles to all samplers. */
.swizzle_result = ~0,
};
/* Lower the format swizzle and (for 32-bit returns)
* ARB_texture_swizzle-style swizzle.
*/
for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
for (int j = 0; j < 4; j++)
tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
if (c->key->tex[i].clamp_s)
tex_options.saturate_s |= 1 << i;
if (c->key->tex[i].clamp_t)
tex_options.saturate_t |= 1 << i;
if (c->key->tex[i].clamp_r)
tex_options.saturate_r |= 1 << i;
if (c->key->tex[i].return_size == 16) {
tex_options.lower_tex_packing[i] =
nir_lower_tex_packing_16;
}
}
NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
NIR_PASS_V(c->s, nir_lower_system_values);
}
static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
int count = c->num_uniforms;
struct v3d_uniform_list *ulist = &prog_data->uniforms;
ulist->count = count;
ulist->data = ralloc_array(prog_data, uint32_t, count);
memcpy(ulist->data, c->uniform_data,
count * sizeof(*ulist->data));
ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
memcpy(ulist->contents, c->uniform_contents,
count * sizeof(*ulist->contents));
}
/* Copy the compiler UBO range state to the compiled shader, dropping out
* arrays that were never referenced by an indirect load.
*
* (Note that VIR dead code elimination of an array access still leaves that
* array alive, though.)
*/
static void
v3d_set_prog_data_ubo(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
if (!c->num_ubo_ranges)
return;
prog_data->num_ubo_ranges = 0;
prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
c->num_ubo_ranges);
for (int i = 0; i < c->num_ubo_ranges; i++) {
if (!c->ubo_range_used[i])
continue;
struct v3d_ubo_range *range = &c->ubo_ranges[i];
prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
prog_data->ubo_size += range->size;
}
if (prog_data->ubo_size) {
if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
vir_get_stage_name(c),
c->program_id, c->variant_id,
prog_data->ubo_size / 4);
}
}
}
static void
v3d_vs_set_prog_data(struct v3d_compile *c,
struct v3d_vs_prog_data *prog_data)
{
/* The vertex data gets format converted by the VPM so that
* each attribute channel takes up a VPM column. Precompute
* the sizes for the shader record.
*/
for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
prog_data->vattr_sizes[i] = c->vattr_sizes[i];
prog_data->vpm_input_size += c->vattr_sizes[i];
}
prog_data->uses_vid = (c->s->info.system_values_read &
(1ull << SYSTEM_VALUE_VERTEX_ID));
prog_data->uses_iid = (c->s->info.system_values_read &
(1ull << SYSTEM_VALUE_INSTANCE_ID));
if (prog_data->uses_vid)
prog_data->vpm_input_size++;
if (prog_data->uses_iid)
prog_data->vpm_input_size++;
/* Input/output segment sizes are in sectors (8 rows of 32 bits per
* channel).
*/
prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
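/* For example (illustrative): three vec4 attributes (12 rows) plus
 * VertexID (1 row) give 13 rows of input, so vpm_input_size =
 * align(13, 8) / 8 = 2 sectors.
 */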
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
prog_data->vpm_input_size);
prog_data->vpm_input_size = 0;
/* Compute VCM cache size. We set up our program to take up less than
* half of the VPM, so that any set of bin and render programs won't
* run out of space. We need space for at least one input segment,
* and then allocate the rest to output segments (one for the current
* program, the rest to VCM). The valid range of the VCM cache size
* field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
* batches.
*/
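/* Illustrative numbers, assuming an 8 KiB VPM: sector_size = 16 * 4 * 8
 * = 512 bytes, so vpm_size_in_sectors = 16 and half_vpm = 8.  With
 * shared segments (vpm_input_size == 0) and a 2-sector output segment,
 * vpm_output_batches = 8 / 2 = 4 and vcm_cache_size = CLAMP(3, 2, 4) = 3.
 */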
assert(c->devinfo->vpm_size);
int sector_size = 16 * sizeof(uint32_t) * 8;
int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
int half_vpm = vpm_size_in_sectors / 2;
int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
assert(vpm_output_batches >= 2);
prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}
static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
prog_data->num_inputs = c->num_inputs;
memcpy(prog_data->input_slots, c->input_slots,
c->num_inputs * sizeof(*c->input_slots));
STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
(V3D_MAX_FS_INPUTS - 1) / 24);
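/* These per-input flags are packed 24 to a 32-bit word, hence the
 * i / 24 word index and 1 << (i % 24) bit position below.
 */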
for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
if (BITSET_TEST(c->flat_shade_flags, i))
prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
if (BITSET_TEST(c->noperspective_flags, i))
prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
if (BITSET_TEST(c->centroid_flags, i))
prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
}
}
static void
v3d_fs_set_prog_data(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
v3d_set_fs_prog_data_inputs(c, prog_data);
prog_data->writes_z = c->writes_z;
prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
}
static void
v3d_set_prog_data(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
prog_data->threads = c->threads;
prog_data->single_seg = !c->last_thrsw;
prog_data->spill_size = c->spill_size;
v3d_set_prog_data_uniforms(c, prog_data);
v3d_set_prog_data_ubo(c, prog_data);
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
} else {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
}
}
static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
*final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
uint64_t *qpu_insts = malloc(*final_assembly_size);
if (!qpu_insts)
return NULL;
memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
vir_compile_destroy(c);
return qpu_insts;
}
static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
/* Split our I/O vars and dead code eliminate the unused
* components.
*/
NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
nir_var_shader_in | nir_var_shader_out);
uint64_t used_outputs[4] = {0};
for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]);
int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]);
used_outputs[comp] |= 1ull << slot;
}
NIR_PASS_V(c->s, nir_remove_unused_io_vars,
&c->s->outputs, used_outputs, NULL); /* demotes to globals */
NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
type_size_vec4,
(nir_lower_io_options)0);
}
static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
nir_foreach_variable(var, &c->s->outputs) {
uint32_t mask = 0;
switch (var->data.location) {
case FRAG_RESULT_COLOR:
mask = ~0;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
break;
}
if (c->fs_key->int_color_rb & mask) {
var->type =
glsl_vector_type(GLSL_TYPE_INT,
glsl_get_components(var->type));
} else if (c->fs_key->uint_color_rb & mask) {
var->type =
glsl_vector_type(GLSL_TYPE_UINT,
glsl_get_components(var->type));
}
}
}
static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
/* If the shader has no non-TLB side effects, we can promote it to
* enabling early_fragment_tests even if the user didn't.
*/
if (!(c->s->info.num_images ||
c->s->info.num_ssbos ||
c->s->info.num_abos)) {
c->s->info.fs.early_fragment_tests = true;
}
}
static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
if (c->vs_key->clamp_color)
NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
if (c->key->ucp_enables) {
NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
false);
NIR_PASS_V(c->s, nir_lower_io_to_scalar,
nir_var_shader_out);
}
/* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}
static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
if (c->fs_key->light_twoside)
NIR_PASS_V(c->s, nir_lower_two_sided_color);
if (c->fs_key->clamp_color)
NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
if (c->fs_key->alpha_test) {
NIR_PASS_V(c->s, nir_lower_alpha_test,
c->fs_key->alpha_test_func,
false);
}
if (c->key->ucp_enables)
NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
/* Note: FS input scalarizing must happen after
* nir_lower_two_sided_color, which only handles a vec4 at a time.
*/
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
nir_shader *s,
void (*debug_output)(const char *msg,
void *debug_output_data),
void *debug_output_data,
int program_id, int variant_id,
uint32_t *final_assembly_size)
{
struct v3d_prog_data *prog_data;
struct v3d_compile *c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id);
switch (c->s->info.stage) {
case MESA_SHADER_VERTEX:
c->vs_key = (struct v3d_vs_key *)key;
prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
break;
case MESA_SHADER_FRAGMENT:
c->fs_key = (struct v3d_fs_key *)key;
prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
break;
default:
unreachable("unsupported shader stage");
}
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_early(c);
} else {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_early(c);
}
v3d_lower_nir(c);
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_late(c);
} else {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_late(c);
}
NIR_PASS_V(c->s, v3d_nir_lower_io, c);
NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
NIR_PASS_V(c->s, nir_lower_idiv);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_lower_bool_to_int32);
NIR_PASS_V(c->s, nir_convert_from_ssa, true);
v3d_nir_to_vir(c);
v3d_set_prog_data(c, prog_data);
*out_prog_data = prog_data;
char *shaderdb;
int ret = asprintf(&shaderdb,
"%s shader: %d inst, %d threads, %d loops, "
"%d uniforms, %d:%d spills:fills",
vir_get_stage_name(c),
c->qpu_inst_count,
c->threads,
c->loops,
c->num_uniforms,
c->spills,
c->fills);
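/* Produces shader-db lines such as (illustrative values):
 *   MESA_SHADER_FRAGMENT shader: 120 inst, 4 threads, 0 loops,
 *   18 uniforms, 0:0 spills:fills
 */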
if (ret >= 0) {
c->debug_output(shaderdb, c->debug_output_data);
free(shaderdb);
}
return v3d_return_qpu_insts(c, final_assembly_size);
}
void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
if (qinst->dst.file == QFILE_TEMP)
c->defs[qinst->dst.index] = NULL;
assert(&qinst->link != c->cursor.link);
list_del(&qinst->link);
free(qinst);
c->live_intervals_valid = false;
}
struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
/* XXX
int pack = reg.pack;
while (reg.file == QFILE_TEMP &&
c->defs[reg.index] &&
(c->defs[reg.index]->op == QOP_MOV ||
c->defs[reg.index]->op == QOP_FMOV) &&
!c->defs[reg.index]->dst.pack &&
!c->defs[reg.index]->src[0].pack) {
reg = c->defs[reg.index]->src[0];
}
reg.pack = pack;
*/
return reg;
}
void
vir_compile_destroy(struct v3d_compile *c)
{
/* Defuse the assert that we aren't removing the cursor's instruction.
*/
c->cursor.link = NULL;
vir_for_each_block(block, c) {
while (!list_empty(&block->instructions)) {
struct qinst *qinst =
list_first_entry(&block->instructions,
struct qinst, link);
vir_remove_instruction(c, qinst);
}
}
ralloc_free(c);
}
struct qreg
vir_uniform(struct v3d_compile *c,
enum quniform_contents contents,
uint32_t data)
{
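/* First look for an existing uniform that already holds this value, so
 * that repeated requests share a single slot.
 */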
for (int i = 0; i < c->num_uniforms; i++) {
if (c->uniform_contents[i] == contents &&
c->uniform_data[i] == data) {
return vir_reg(QFILE_UNIF, i);
}
}
uint32_t uniform = c->num_uniforms++;
if (uniform >= c->uniform_array_size) {
c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
c->uniform_array_size * 2);
c->uniform_data = reralloc(c, c->uniform_data,
uint32_t,
c->uniform_array_size);
c->uniform_contents = reralloc(c, c->uniform_contents,
enum quniform_contents,
c->uniform_array_size);
}
c->uniform_contents[uniform] = contents;
c->uniform_data[uniform] = data;
return vir_reg(QFILE_UNIF, uniform);
}
#define OPTPASS(func) \
do { \
bool stage_progress = func(c); \
if (stage_progress) { \
progress = true; \
if (print_opt_debug) { \
fprintf(stderr, \
"VIR opt pass %2d: %s progress\n", \
pass, #func); \
} \
/*XXX vir_validate(c);*/ \
} \
} while (0)
void
vir_optimize(struct v3d_compile *c)
{
bool print_opt_debug = false;
int pass = 1;
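/* Run the peephole passes to a fixed point: one pass's rewrites (e.g.
 * copy propagation) can expose more work for another (e.g. dead code
 * elimination).
 */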
while (true) {
bool progress = false;
OPTPASS(vir_opt_copy_propagate);
OPTPASS(vir_opt_dead_code);
OPTPASS(vir_opt_small_immediates);
if (!progress)
break;
pass++;
}
}
const char *
vir_get_stage_name(struct v3d_compile *c)
{
if (c->vs_key && c->vs_key->is_coord)
return "MESA_SHADER_COORD";
else
return gl_shader_stage_name(c->s->info.stage);
}