ir3: Convert to register intrinsics

Thanks to our SSA-based RA, we only use nir_register for arrays, and we only
access array registers with dedicated moves anyway. So there's no need for
any fancy coalescing... we can just switch to register access intrinsics and
translate them to moves exactly like we would've done when getting srcs/dests
before.

This addresses the ir3 portion of #9051.
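
For a concrete picture of the NIR being consumed here, a minimal sketch (not part of this commit, and assuming the nir_builder convenience helpers nir_decl_reg()/nir_load_reg()/nir_store_reg()):

#include "nir_builder.h"

/* Illustrative only: a scalar "register" written with the new intrinsics.
 * decl_reg stands in for the old nir_register, and load_reg/store_reg stand
 * in for register destinations/sources; the backend translates them into the
 * same array moves as before.  Array elements and indirect indexing instead
 * use a base index plus the *_indirect variants handled in the diff below. */
static nir_ssa_def *
reg_roundtrip_sketch(nir_builder *b, nir_ssa_def *value)
{
   /* 1 component, 32 bits, num_array_elems = 0 (i.e. not an array) */
   nir_ssa_def *reg = nir_decl_reg(b, 1, 32, 0);

   nir_store_reg(b, value, reg);
   return nir_load_reg(b, reg);
}

The new ir3 cases handle the array/indirect forms by computing n = base * num_components + component, matching the old nir_register indexing.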

No shader-db changes with a (significant subset of) Rob's shader-db. (Some
shaders are affected by this change but not in any way that shows up in the
stats.)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24126>
Author: Alyssa Rosenzweig
Date: 2023-07-12 14:21:43 -04:00
Parent: 7ad9416c61
Commit: 1e9f4b967a

4 changed files with 84 additions and 99 deletions


@@ -580,7 +580,7 @@ struct ir3_array {
unsigned length;
unsigned id;
- struct nir_register *r;
+ struct nir_ssa_def *r;
/* To avoid array write's from getting DCE'd, keep track of the
* most recent write. Any array access depends on the most


@@ -1992,6 +1992,63 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
const unsigned primitive_map = const_state->offsets.primitive_map * 4;
switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ /* There's logically nothing to do, but this has a destination in NIR so
+ * plug in something... It will get DCE'd.
+ */
+ dst[0] = create_immed(ctx->block, 0);
+ break;
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_load_reg_indirect: {
+ struct ir3_array *arr = ir3_get_array(ctx, intr->src[0].ssa);
+ struct ir3_instruction *addr = NULL;
+ if (intr->intrinsic == nir_intrinsic_load_reg_indirect) {
+ addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[1])[0],
+ dest_components);
+ }
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[0].ssa);
+ assert(dest_components == nir_intrinsic_num_components(decl));
+ for (unsigned i = 0; i < dest_components; i++) {
+ unsigned n = nir_intrinsic_base(intr) * dest_components + i;
+ compile_assert(ctx, n < arr->length);
+ dst[i] = ir3_create_array_load(ctx, arr, n, addr);
+ }
+ break;
+ }
+ case nir_intrinsic_store_reg:
+ case nir_intrinsic_store_reg_indirect: {
+ struct ir3_array *arr = ir3_get_array(ctx, intr->src[1].ssa);
+ unsigned num_components = nir_src_num_components(intr->src[0]);
+ struct ir3_instruction *addr = NULL;
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[1].ssa);
+ assert(num_components == nir_intrinsic_num_components(decl));
+ struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[0]);
+ if (intr->intrinsic == nir_intrinsic_store_reg_indirect) {
+ addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[2])[0],
+ num_components);
+ }
+ u_foreach_bit(i, nir_intrinsic_write_mask(intr)) {
+ assert(i < num_components);
+ unsigned n = nir_intrinsic_base(intr) * num_components + i;
+ compile_assert(ctx, n < arr->length);
+ if (value[i])
+ ir3_create_array_store(ctx, arr, n, value[i], addr);
+ }
+ break;
+ }
case nir_intrinsic_load_uniform:
idx = nir_intrinsic_base(intr);
if (nir_src_is_const(intr->src[0])) {
@@ -4377,8 +4434,8 @@ emit_instructions(struct ir3_context *ctx)
ctx->so->shared_size = ctx->s->info.shared_size;
/* NOTE: need to do something more clever when we support >1 fxn */
- nir_foreach_register (reg, &fxn->registers) {
- ir3_declare_array(ctx, reg);
+ nir_foreach_reg_decl (decl, fxn) {
+ ir3_declare_array(ctx, decl);
}
if (ctx->so->type == MESA_SHADER_TESS_CTRL &&


@@ -29,6 +29,8 @@
#include "ir3_image.h"
#include "ir3_nir.h"
#include "ir3_shader.h"
+ #include "nir.h"
+ #include "nir_intrinsics_indices.h"
struct ir3_context *
ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
@@ -87,9 +89,9 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
*/
bool progress = false;
bool needs_late_alg = false;
- NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs, 1);
+ NIR_PASS(progress, ctx->s, nir_lower_locals_to_reg_intrinsics, 1);
- /* we could need cleanup after lower_locals_to_regs */
+ /* we could need cleanup after lower_locals_to_reg_intrinsics */
while (progress) {
progress = false;
NIR_PASS(progress, ctx->s, nir_opt_algebraic);
@@ -98,9 +100,9 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
}
/* We want to lower nir_op_imul as late as possible, to catch also
- * those generated by earlier passes (e.g, nir_lower_locals_to_regs).
- * However, we want a final swing of a few passes to have a chance
- * at optimizing the result.
+ * those generated by earlier passes (e.g,
+ * nir_lower_locals_to_reg_intrinsics). However, we want a final swing of a
+ * few passes to have a chance at optimizing the result.
*/
progress = false;
NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
@@ -201,17 +203,9 @@ ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
struct ir3_instruction **
ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
{
- struct ir3_instruction **value;
+ assert(dst->is_ssa);
+ struct ir3_instruction **value = ir3_get_dst_ssa(ctx, &dst->ssa, n);
- if (dst->is_ssa) {
- value = ir3_get_dst_ssa(ctx, &dst->ssa, n);
- } else {
- value = ralloc_array(ctx, struct ir3_instruction *, n);
- }
- /* NOTE: in non-ssa case, we don't really need to store last_dst
- * but this helps us catch cases where put_dst() call is forgotten
- */
compile_assert(ctx, !ctx->last_dst);
ctx->last_dst = value;
ctx->last_dst_n = n;
@@ -222,31 +216,11 @@ ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
struct ir3_instruction *const *
ir3_get_src(struct ir3_context *ctx, nir_src *src)
{
- if (src->is_ssa) {
- struct hash_entry *entry;
- entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
- compile_assert(ctx, entry);
- return entry->data;
- } else {
- nir_register *reg = src->reg.reg;
- struct ir3_array *arr = ir3_get_array(ctx, reg);
- unsigned num_components = arr->r->num_components;
- struct ir3_instruction *addr = NULL;
- struct ir3_instruction **value =
- ralloc_array(ctx, struct ir3_instruction *, num_components);
- if (src->reg.indirect)
- addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
- reg->num_components);
- for (unsigned i = 0; i < num_components; i++) {
- unsigned n = src->reg.base_offset * reg->num_components + i;
- compile_assert(ctx, n < arr->length);
- value[i] = ir3_create_array_load(ctx, arr, n, addr);
- }
- return value;
- }
+ assert(src->is_ssa);
+ struct hash_entry *entry;
+ entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
+ compile_assert(ctx, entry);
+ return entry->data;
}
void
@@ -279,27 +253,7 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
}
}
- if (!dst->is_ssa) {
- nir_register *reg = dst->reg.reg;
- struct ir3_array *arr = ir3_get_array(ctx, reg);
- unsigned num_components = ctx->last_dst_n;
- struct ir3_instruction *addr = NULL;
- if (dst->reg.indirect)
- addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
- reg->num_components);
- for (unsigned i = 0; i < num_components; i++) {
- unsigned n = dst->reg.base_offset * reg->num_components + i;
- compile_assert(ctx, n < arr->length);
- if (!ctx->last_dst[i])
- continue;
- ir3_create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
- }
- ralloc_free(ctx->last_dst);
- }
+ assert(dst->is_ssa);
ctx->last_dst = NULL;
ctx->last_dst_n = 0;
}
@@ -543,7 +497,7 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
*/
void
- ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
+ ir3_declare_array(struct ir3_context *ctx, nir_intrinsic_instr *decl)
{
struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
arr->id = ++ctx->num_arrays;
@@ -554,15 +508,17 @@ ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
* It would be nice if there was a nir pass to convert arrays of
* length 1 to ssa.
*/
- arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
+ arr->length = nir_intrinsic_num_components(decl) *
+ MAX2(1, nir_intrinsic_num_array_elems(decl));
compile_assert(ctx, arr->length > 0);
- arr->r = reg;
- arr->half = ir3_bitsize(ctx, reg->bit_size) <= 16;
+ arr->r = &decl->dest.ssa;
+ arr->half = ir3_bitsize(ctx, nir_intrinsic_bit_size(decl)) <= 16;
list_addtail(&arr->node, &ctx->ir->array_list);
}
struct ir3_array *
- ir3_get_array(struct ir3_context *ctx, nir_register *reg)
+ ir3_get_array(struct ir3_context *ctx, nir_ssa_def *reg)
{
foreach_array (arr, &ctx->ir->array_list) {
if (arr->r == reg)
@@ -622,34 +578,6 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
struct ir3_register *dst;
unsigned flags = 0;
- /* if not relative store, don't create an extra mov, since that
- * ends up being difficult for cp to remove.
- *
- * Also, don't skip the mov if the src is meta (like fanout/split),
- * since that creates a situation that RA can't really handle properly.
- */
- if (!address && !is_meta(src)) {
- dst = src->dsts[0];
- src->barrier_class |= IR3_BARRIER_ARRAY_W;
- src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
- dst->flags |= IR3_REG_ARRAY;
- dst->size = arr->length;
- dst->array.id = arr->id;
- dst->array.offset = n;
- dst->array.base = INVALID_REG;
- if (arr->last_write && arr->last_write->instr->block == src->block)
- ir3_reg_set_last_array(src, dst, arr->last_write);
- arr->last_write = dst;
- array_insert(block, block->keeps, src);
- return;
- }
mov = ir3_instr_create(block, OPC_MOV, 1, 1);
if (arr->half) {
mov->cat1.src_type = TYPE_U16;


@@ -243,8 +243,8 @@ struct ir3_instruction *ir3_get_addr1(struct ir3_context *ctx,
struct ir3_instruction *ir3_get_predicate(struct ir3_context *ctx,
struct ir3_instruction *src);
- void ir3_declare_array(struct ir3_context *ctx, nir_register *reg);
- struct ir3_array *ir3_get_array(struct ir3_context *ctx, nir_register *reg);
+ void ir3_declare_array(struct ir3_context *ctx, nir_intrinsic_instr *decl);
+ struct ir3_array *ir3_get_array(struct ir3_context *ctx, nir_ssa_def *reg);
struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx,
struct ir3_array *arr, int n,
struct ir3_instruction *address);