agx: Preload vertex/instance ID only at start
This means we don't reserve the registers, which improves RA considerably.
Using a special preload pseudo-op instead of a regular move allows us to
constrain semantics and guarantee coalescing.

shader-db on glmark2 subset:

total instructions in shared programs: 6448 -> 6442 (-0.09%)
instructions in affected programs: 230 -> 224 (-2.61%)
helped: 4
HURT: 0

total bytes in shared programs: 42232 -> 42196 (-0.09%)
bytes in affected programs: 1530 -> 1494 (-2.35%)
helped: 4
HURT: 0

total halfregs in shared programs: 2291 -> 1926 (-15.93%)
halfregs in affected programs: 2185 -> 1820 (-16.70%)
helped: 75
HURT: 0

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18804>
Committed by: Marge Bot
Parent: f665229d77
Commit: c9a96d4615
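For readers skimming the diff, here is a minimal stand-alone sketch (not Mesa code; every name in it is an invented stand-in) of the lazy caching that agx_cached_preload() below implements: the first request for a preloaded value emits one preload at the start of the shader and caches the result, and every later request reuses that cached value instead of reading the fixed register again.

/* Hypothetical illustration only -- the types and helpers are stand-ins
 * for the AGX IR structures, not the real API. */
#include <stdbool.h>
#include <stdio.h>

typedef struct { bool valid; unsigned reg; } value;            /* ~ agx_index */
typedef struct { value vertex_id, instance_id; int emitted; } shader_ctx;

static value
cached_preload(shader_ctx *ctx, value *cache, unsigned reg)
{
   /* Emit the preload only on the first use; later callers get the cached
    * value instead of another copy of the register. */
   if (!cache->valid) {
      ctx->emitted++;                      /* stands in for agx_preload() */
      *cache = (value){ .valid = true, .reg = reg };
   }
   return *cache;
}

int main(void)
{
   shader_ctx ctx = {0};
   cached_preload(&ctx, &ctx.vertex_id, 10);   /* first use: emits preload */
   cached_preload(&ctx, &ctx.vertex_id, 10);   /* second use: cache hit */
   cached_preload(&ctx, &ctx.instance_id, 12); /* distinct value: emits */
   printf("preloads emitted: %d\n", ctx.emitted); /* prints 2 */
   return 0;
}

In the real compiler the cached value is an SSA agx_index whose defining preload RA later coalesces back onto the source register (see the AGX_OPCODE_PRELOAD handling further down).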
@@ -52,6 +52,30 @@ int agx_debug = 0;
       fprintf(stderr, "%s:%d: "fmt, \
               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
 
+static agx_index
+agx_cached_preload(agx_context *ctx, agx_index *cache, unsigned base, enum agx_size size)
+{
+   if (agx_is_null(*cache)) {
+      agx_block *block = agx_start_block(ctx);
+      agx_builder b = agx_init_builder(ctx, agx_before_block(block));
+      *cache = agx_preload(&b, agx_register(base, size));
+   }
+
+   return *cache;
+}
+
+static agx_index
+agx_vertex_id(agx_builder *b)
+{
+   return agx_cached_preload(b->shader, &b->shader->vertex_id, 10, AGX_SIZE_32);
+}
+
+static agx_index
+agx_instance_id(agx_builder *b)
+{
+   return agx_cached_preload(b->shader, &b->shader->instance_id, 12, AGX_SIZE_32);
+}
+
 static agx_index
 agx_get_cf(agx_context *ctx, bool smooth, bool perspective,
            gl_varying_slot slot, unsigned offset, unsigned count)
@@ -327,13 +351,10 @@ agx_emit_load_attr(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
    agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
    agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
 
-   agx_index vertex_id = agx_register(10, AGX_SIZE_32);
-   agx_index instance_id = agx_register(12, AGX_SIZE_32);
-
    /* A nonzero divisor requires dividing the instance ID. A zero divisor
     * specifies per-instance data. */
-   agx_index element_id = (attrib.divisor == 0) ? vertex_id :
-                          agx_udiv_const(b, instance_id, attrib.divisor);
+   agx_index element_id = (attrib.divisor == 0) ? agx_vertex_id(b) :
+                          agx_udiv_const(b, agx_instance_id(b), attrib.divisor);
 
    agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
 
@@ -683,10 +704,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
            AGX_PUSH_TEXTURE_BASE, AGX_SIZE_64, 0, 4));
 
    case nir_intrinsic_load_vertex_id:
-      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));
+      return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
 
    case nir_intrinsic_load_instance_id:
-      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));
+      return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
 
    case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
    case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
@@ -396,6 +396,11 @@ typedef struct {
     * components, populated by a split. */
    struct hash_table_u64 *allocated_vec;
 
+   /* During instruction selection, preloaded values,
+    * or NULL if it hasn't been preloaded
+    */
+   agx_index vertex_id, instance_id;
+
    /* Stats for shader-db */
    unsigned loop_count;
    unsigned spills;
@@ -456,6 +461,20 @@ agx_vec_for_intr(agx_context *ctx, nir_intrinsic_instr *instr)
    return agx_vec_for_dest(ctx, &instr->dest);
 }
 
+static inline unsigned
+agx_num_predecessors(agx_block *block)
+{
+   return util_dynarray_num_elements(&block->predecessors, agx_block *);
+}
+
+static inline agx_block *
+agx_start_block(agx_context *ctx)
+{
+   agx_block *first = list_first_entry(&ctx->blocks, agx_block, link);
+   assert(agx_num_predecessors(first) == 0);
+   return first;
+}
+
 /* Iterators for AGX IR */
 
 #define agx_foreach_block(ctx, v) \
@@ -650,6 +669,25 @@ agx_after_block_logical(agx_block *block)
    return agx_after_block(block);
 }
 
+
+static inline agx_cursor
+agx_before_nonempty_block(agx_block *block)
+{
+   agx_instr *I = list_first_entry(&block->instructions, agx_instr, link);
+   assert(I != NULL);
+
+   return agx_before_instr(I);
+}
+
+static inline agx_cursor
+agx_before_block(agx_block *block)
+{
+   if (list_is_empty(&block->instructions))
+      return agx_after_block(block);
+   else
+      return agx_before_nonempty_block(block);
+}
+
 /* IR builder in terms of cursor infrastructure */
 
 typedef struct {
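A note on the cursor helpers just added: agx_before_block() is what lets the preload land at the very top of the start block. If the block already has instructions, the cursor points before the first one; if the block happens to be empty, pointing "after" the empty block is equivalent, which is why that case falls back to agx_after_block() instead of dereferencing an empty list.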
@@ -281,3 +281,7 @@ op("split", _, srcs = 1, dests = VARIABLE)
 op("phi", _, srcs = VARIABLE)
 
 op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)
+
+# Like mov, but takes a register and can only appear at the start. Gauranteed
+# to be coalesced during RA, rather than lowered to a real move.
+op("preload", _, srcs = 1)
@@ -111,10 +111,6 @@ agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
    }
 
    BITSET_SET(used_regs, 0); // control flow writes r0l
-   BITSET_SET(used_regs, 5*2); // TODO: precolouring, don't overwrite vertex ID
-   BITSET_SET(used_regs, (5*2 + 1));
-   BITSET_SET(used_regs, (6*2 + 0));
-   BITSET_SET(used_regs, (6*2 + 1));
 
    agx_foreach_instr_in_block(block, I) {
       /* Optimization: if a split contains the last use of a vector, the split
@@ -152,6 +148,21 @@ agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
             ssa_to_reg[I->dest[d].value] = reg + offset;
          }
 
          continue;
+      } else if (I->op == AGX_OPCODE_PRELOAD) {
+         /* We must coalesce all preload moves */
+         assert(I->dest[0].type == AGX_INDEX_NORMAL);
+         assert(I->dest[0].size == I->src[0].size);
+         assert(I->src[0].type == AGX_INDEX_REGISTER);
+
+         unsigned base = I->src[0].value;
+
+         for (unsigned i = 0; i < agx_size_align_16(I->src[0].size); ++i) {
+            assert(!BITSET_TEST(used_regs, base + i));
+            BITSET_SET(used_regs, base + i);
+         }
+
+         ssa_to_reg[I->dest[0].value] = base;
+         continue;
       }
 
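A unit note that may help when reading the PRELOAD case above: register indices are tracked in 16-bit halves, so the 32-bit vertex and instance IDs at indices 10 and 12 occupy halfreg pairs 10/11 and 12/13, exactly the slots the removed BITSET_SET(used_regs, 5*2 ...) lines reserved unconditionally. A small stand-alone sketch of that bookkeeping (invented names, plain bit operations rather than Mesa's BITSET macros):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 256 halfregs tracked as a plain bitset. */
static uint64_t used_regs[256 / 64];

static void
reserve(unsigned base, unsigned halfregs)
{
   for (unsigned i = 0; i < halfregs; ++i) {
      unsigned r = base + i;
      assert(!(used_regs[r / 64] & (1ull << (r % 64)))); /* must still be free */
      used_regs[r / 64] |= 1ull << (r % 64);
   }
}

int main(void)
{
   reserve(10, 2); /* 32-bit vertex ID at halfreg 10 -> claims 10 and 11 */
   reserve(12, 2); /* 32-bit instance ID at halfreg 12 -> claims 12 and 13 */
   printf("used_regs[0] = 0x%llx\n", (unsigned long long)used_regs[0]); /* 0x3c00 */
   return 0;
}

With the preload pseudo-op, these halfregs are only claimed in shaders that actually contain a preload, which is where the 15.93% halfreg reduction in the commit message comes from.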
@@ -363,18 +374,29 @@ agx_ra(agx_context *ctx)
       agx_insert_parallel_copies(ctx, block);
    }
 
-   /* Phi nodes can be removed now */
    agx_foreach_instr_global_safe(ctx, I) {
-      if (I->op == AGX_OPCODE_PHI || I->op == AGX_OPCODE_LOGICAL_END)
+      switch (I->op) {
+      /* Pseudoinstructions for RA must be removed now */
+      case AGX_OPCODE_PHI:
+      case AGX_OPCODE_LOGICAL_END:
+      case AGX_OPCODE_PRELOAD:
          agx_remove_instruction(I);
+         break;
 
-      /* Remove identity moves */
-      if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_REGISTER &&
-          I->dest[0].size == I->src[0].size && I->src[0].value == I->dest[0].value) {
+      /* Coalesced moves can be removed */
+      case AGX_OPCODE_MOV:
+         if (I->src[0].type == AGX_INDEX_REGISTER &&
+             I->dest[0].size == I->src[0].size &&
+             I->src[0].value == I->dest[0].value) {
 
          assert(I->dest[0].type == AGX_INDEX_REGISTER);
          agx_remove_instruction(I);
       }
+         break;
+
+      default:
+         break;
+      }
    }
 
    free(ssa_to_reg);
@@ -165,3 +165,11 @@ TEST_F(Optimizer, IntCopypropDoesntConvert)
       agx_xor_to(b, out, cvt, wy);
    });
 }
+
+TEST_F(Optimizer, SkipPreloads)
+{
+   NEGCASE32({
+      agx_index preload = agx_preload(b, agx_register(0, AGX_SIZE_32));
+      agx_xor_to(b, out, preload, wy);
+   });
+}
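The new SkipPreloads test is a negative case: NEGCASE32, as used elsewhere in this test file, appears to assert that running the optimizer leaves the program unchanged. In other words, copy propagation must not look through the preload and substitute the raw register into the xor, since that would reintroduce the fixed-register read the preload pseudo-op exists to avoid.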