ir3: Plumb through support for a1.x

This will need to be used in some cases for the upcoming bindless
support, plus ldc.k instructions which push data from a UBO to const
registers.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4358>
Commit authored by Connor Abbott on 2020-03-18 18:06:41 +01:00; committed by Marge Bot.
parent c8b0f90439
commit de7d90ef53
11 changed files with 164 additions and 67 deletions

View File

@@ -1086,7 +1086,14 @@ ir3_instr_set_address(struct ir3_instruction *instr,
debug_assert(instr->block == addr->block);
instr->address = addr;
array_insert(ir, ir->indirects, instr);
debug_assert(reg_num(addr->regs[0]) == REG_A0);
unsigned comp = reg_comp(addr->regs[0]);
if (comp == 0) {
array_insert(ir, ir->a0_users, instr);
} else {
debug_assert(comp == 1);
array_insert(ir, ir->a1_users, instr);
}
}
}

View File

@@ -469,7 +469,10 @@ struct ir3 {
* convenient list of instructions that reference some address
* register simplifies this.
*/
DECLARE_ARRAY(struct ir3_instruction *, indirects);
DECLARE_ARRAY(struct ir3_instruction *, a0_users);
/* same for a1.x: */
DECLARE_ARRAY(struct ir3_instruction *, a1_users);
/* and same for instructions that consume predicate register: */
DECLARE_ARRAY(struct ir3_instruction *, predicates);
@@ -695,10 +698,10 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
dst = instr->regs[0];
/* mov's that write to a0.x or p0.x are special: */
/* mov's that write to a0 or p0.x are special: */
if (dst->num == regid(REG_P0, 0))
return false;
if (dst->num == regid(REG_A0, 0))
if (reg_num(dst) == REG_A0)
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
@@ -848,11 +851,20 @@ static inline unsigned dest_regs(struct ir3_instruction *instr)
return util_last_bit(instr->regs[0]->wrmask);
}
static inline bool writes_addr(struct ir3_instruction *instr)
static inline bool writes_addr0(struct ir3_instruction *instr)
{
if (instr->regs_count > 0) {
struct ir3_register *dst = instr->regs[0];
return reg_num(dst) == REG_A0;
return dst->num == regid(REG_A0, 0);
}
return false;
}
/* does this instruction write the a1.x address register?
 * (a1.x is component 1 of the REG_A0 file)
 */
static inline bool writes_addr1(struct ir3_instruction *instr)
{
	if (instr->regs_count == 0)
		return false;
	return instr->regs[0]->num == regid(REG_A0, 1);
}

View File

@@ -744,8 +744,8 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz));
base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
} else {
base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz));
base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz));
base_lo = create_uniform_indirect(b, ubo, ir3_get_addr0(ctx, src0, ptrsz));
base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr0(ctx, src0, ptrsz));
/* NOTE: since relative addressing is used, make sure constlen is
* at least big enough to cover all the UBO addresses, since the
@@ -1362,7 +1362,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
src = ir3_get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
dst[i] = create_uniform_indirect(b, idx + i,
ir3_get_addr(ctx, src[0], 1));
ir3_get_addr0(ctx, src[0], 1));
}
/* NOTE: if relative addressing is used, we set
* constlen in the compiler (to worst-case value)
@@ -1558,7 +1558,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
src = ir3_get_src(ctx, &intr->src[0]);
struct ir3_instruction *collect =
ir3_create_collect(ctx, ctx->ir->inputs, ctx->ninputs);
struct ir3_instruction *addr = ir3_get_addr(ctx, src[0], 4);
struct ir3_instruction *addr = ir3_get_addr0(ctx, src[0], 4);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i + comp;
dst[i] = create_indirect_load(ctx, ctx->ninputs,
@@ -2424,11 +2424,14 @@ emit_block(struct ir3_context *ctx, nir_block *nblock)
list_addtail(&block->node, &ctx->ir->block_list);
/* re-emit addr register in each block if needed: */
for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
_mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
ctx->addr_ht[i] = NULL;
for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
_mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
ctx->addr0_ht[i] = NULL;
}
_mesa_hash_table_u64_destroy(ctx->addr1_ht, NULL);
ctx->addr1_ht = NULL;
nir_foreach_instr (instr, nblock) {
ctx->cur_instr = instr;
emit_instr(ctx, instr);

View File

@@ -184,7 +184,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
ralloc_array(ctx, struct ir3_instruction *, num_components);
if (src->reg.indirect)
addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
reg->num_components);
for (unsigned i = 0; i < num_components; i++) {
@@ -230,7 +230,7 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
struct ir3_instruction *addr = NULL;
if (dst->reg.indirect)
addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
reg->num_components);
for (unsigned i = 0; i < num_components; i++) {
@@ -378,7 +378,7 @@ ir3_context_error(struct ir3_context *ctx, const char *format, ...)
}
static struct ir3_instruction *
create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
{
struct ir3_instruction *instr, *immed;
@@ -433,29 +433,62 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
return instr;
}
/* Create an instruction sequence that writes the immediate `const_val`
 * into the a1.x address register (component 1 of the REG_A0 file).
 */
static struct ir3_instruction *
create_addr1(struct ir3_block *block, unsigned const_val)
{
	/* materialize the constant, then mov it into a1.x: */
	struct ir3_instruction *immed = create_immed(block, const_val);
	struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
	/* pin the mov's dst to a1.x instead of an SSA-assigned register: */
	instr->regs[0]->num = regid(REG_A0, 1);
	instr->regs[0]->flags &= ~IR3_REG_SSA;
	/* the mov is 16-bit (TYPE_S16), so both dst and src are half regs: */
	instr->regs[0]->flags |= IR3_REG_HALF;
	instr->regs[1]->flags |= IR3_REG_HALF;
	return instr;
}
/* caches addr values to avoid generating multiple cov/shl/mova
* sequences for each use of a given NIR level src as address
*/
struct ir3_instruction *
ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
{
struct ir3_instruction *addr;
unsigned idx = align - 1;
compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
if (!ctx->addr_ht[idx]) {
ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
if (!ctx->addr0_ht[idx]) {
ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
} else {
struct hash_entry *entry;
entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
if (entry)
return entry->data;
}
addr = create_addr(ctx->block, src, align);
_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
addr = create_addr0(ctx->block, src, align);
_mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
return addr;
}
/* Similar to ir3_get_addr0, but for a1.x. */
struct ir3_instruction *
ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
{
	struct ir3_instruction *addr;

	if (ctx->addr1_ht) {
		/* cache exists; return any previously created addr for this value: */
		addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
		if (addr)
			return addr;
	} else {
		/* lazily create the cache on first use: */
		ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
	}

	addr = create_addr1(ctx->block, const_val);
	_mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);

	return addr;
}

View File

@@ -120,7 +120,12 @@ struct ir3_context {
* src used for an array of vec1 cannot be also used for an
* array of vec4.
*/
struct hash_table *addr_ht[4];
struct hash_table *addr0_ht[4];
/* The same for a1.x. We only support immediate values for a1.x, as this
* is the only use so far.
*/
struct hash_table_u64 *addr1_ht;
/* last dst array, for indirect we need to insert a var-store.
*/
@@ -176,8 +181,10 @@ NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, ...
if (!(cond)) ir3_context_error((ctx), "failed assert: "#cond"\n"); \
} while (0)
struct ir3_instruction * ir3_get_addr(struct ir3_context *ctx,
struct ir3_instruction * ir3_get_addr0(struct ir3_context *ctx,
struct ir3_instruction *src, int align);
struct ir3_instruction * ir3_get_addr1(struct ir3_context *ctx,
unsigned const_val);
struct ir3_instruction * ir3_get_predicate(struct ir3_context *ctx,
struct ir3_instruction *src);

View File

@@ -82,7 +82,7 @@ ir3_delayslots(struct ir3_instruction *assigner,
if (is_meta(assigner) || is_meta(consumer))
return 0;
if (writes_addr(assigner))
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
/* On a6xx, it takes the number of delay slots to get a SFU result

View File

@@ -201,10 +201,16 @@ compute_depth_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
/* note that we can end up with unused indirects, but we should
* not end up with unused predicates.
*/
for (i = 0; i < ir->indirects_count; i++) {
struct ir3_instruction *instr = ir->indirects[i];
for (i = 0; i < ir->a0_users_count; i++) {
struct ir3_instruction *instr = ir->a0_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->indirects[i] = NULL;
ir->a0_users[i] = NULL;
}
for (i = 0; i < ir->a1_users_count; i++) {
struct ir3_instruction *instr = ir->a1_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->a1_users[i] = NULL;
}
/* cleanup unused inputs: */

View File

@@ -392,7 +392,6 @@ static void
calculate_deps(struct ir3_postsched_deps_state *state,
struct ir3_postsched_node *node)
{
static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
struct ir3_register *reg;
int b;
@@ -400,12 +399,6 @@ calculate_deps(struct ir3_postsched_deps_state *state,
* in the reverse direction) wrote any of our src registers:
*/
foreach_src_n (reg, i, node->instr) {
/* NOTE: relative access for a src can be either const or gpr: */
if (reg->flags & IR3_REG_RELATIV) {
/* also reads a0.x: */
add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
}
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
@@ -428,6 +421,12 @@ calculate_deps(struct ir3_postsched_deps_state *state,
}
}
if (node->instr->address) {
add_reg_dep(state, node, node->instr->address->regs[0],
node->instr->address->regs[0]->num,
false);
}
if (dest_regs(node->instr) == 0)
return;
@@ -441,9 +440,6 @@ calculate_deps(struct ir3_postsched_deps_state *state,
for (unsigned i = 0; i < arr->length; i++) {
add_reg_dep(state, node, reg, arr->reg + i, true);
}
/* also reads a0.x: */
add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
} else {
foreach_bit (b, reg->wrmask) {
add_reg_dep(state, node, reg, reg->num + b, true);

View File

@@ -264,7 +264,7 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
if (instr->regs_count == 0)
continue;
/* couple special cases: */
if (writes_addr(instr) || writes_pred(instr)) {
if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) {
id->cls = -1;
} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
id->cls = total_class_count;

View File

@@ -199,7 +199,7 @@ writes_gpr(struct ir3_instruction *instr)
/* is dest a normal temp register: */
struct ir3_register *reg = instr->regs[0];
debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)));
if ((reg->num == regid(REG_A0, 0)) ||
if ((reg_num(reg) == REG_A0) ||
(reg->num == regid(REG_P0, 0)))
return false;
return true;

View File

@@ -68,7 +68,8 @@ struct ir3_sched_ctx {
struct ir3_block *block; /* the current block */
struct list_head depth_list; /* depth sorted unscheduled instrs */
struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *addr0; /* current a0.x user, if any */
struct ir3_instruction *addr1; /* current a1.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
int live_values; /* estimate of current live values */
int half_live_values; /* estimate of current half precision live values */
@@ -225,9 +226,14 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
*/
list_delinit(&instr->node);
if (writes_addr(instr)) {
debug_assert(ctx->addr == NULL);
ctx->addr = instr;
if (writes_addr0(instr)) {
debug_assert(ctx->addr0 == NULL);
ctx->addr0 = instr;
}
if (writes_addr1(instr)) {
debug_assert(ctx->addr1 == NULL);
ctx->addr1 = instr;
}
if (writes_pred(instr)) {
@@ -244,7 +250,7 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
update_live_values(ctx, instr);
if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) {
clear_cache(ctx, NULL);
} else {
/* invalidate only the necessary entries.. */
@@ -281,7 +287,7 @@ struct ir3_sched_notes {
/* there is at least one instruction that could be scheduled,
* except for conflicting address/predicate register usage:
*/
bool addr_conflict, pred_conflict;
bool addr0_conflict, addr1_conflict, pred_conflict;
};
/* could an instruction be scheduled if specified ssa src was scheduled? */
@@ -314,11 +320,28 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
* TODO if any instructions use pred register and have other
* src args, we would need to do the same for writes_pred()..
*/
if (writes_addr(instr)) {
if (writes_addr0(instr)) {
struct ir3 *ir = instr->block->shader;
bool ready = false;
for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
struct ir3_instruction *indirect = ir->indirects[i];
for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
struct ir3_instruction *indirect = ir->a0_users[i];
if (!indirect)
continue;
if (indirect->address != instr)
continue;
ready = could_sched(indirect, instr);
}
/* nothing could be scheduled, so keep looking: */
if (!ready)
return false;
}
if (writes_addr1(instr)) {
struct ir3 *ir = instr->block->shader;
bool ready = false;
for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
struct ir3_instruction *indirect = ir->a1_users[i];
if (!indirect)
continue;
if (indirect->address != instr)
@@ -335,9 +358,15 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
* register is currently in use, we need to defer until it is
* free:
*/
if (writes_addr(instr) && ctx->addr) {
debug_assert(ctx->addr != instr);
notes->addr_conflict = true;
if (writes_addr0(instr) && ctx->addr0) {
debug_assert(ctx->addr0 != instr);
notes->addr0_conflict = true;
return false;
}
if (writes_addr1(instr) && ctx->addr1) {
debug_assert(ctx->addr1 != instr);
notes->addr1_conflict = true;
return false;
}
@@ -585,23 +614,21 @@ split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
return new_instr;
}
/* "spill" the address register by remapping any unscheduled
/* "spill" the address registers by remapping any unscheduled
* instructions which depend on the current address register
* to a clone of the instruction which wrote the address reg.
*/
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
struct ir3_instruction **users, unsigned users_count)
{
struct ir3 *ir;
struct ir3_instruction *new_addr = NULL;
unsigned i;
debug_assert(ctx->addr);
debug_assert(*addr);
ir = ctx->addr->block->shader;
for (i = 0; i < ir->indirects_count; i++) {
struct ir3_instruction *indirect = ir->indirects[i];
for (i = 0; i < users_count; i++) {
struct ir3_instruction *indirect = users[i];
if (!indirect)
continue;
@@ -613,9 +640,9 @@ split_addr(struct ir3_sched_ctx *ctx)
/* remap remaining instructions using current addr
* to new addr:
*/
if (indirect->address == ctx->addr) {
if (indirect->address == *addr) {
if (!new_addr) {
new_addr = split_instr(ctx, ctx->addr);
new_addr = split_instr(ctx, *addr);
/* original addr is scheduled, but new one isn't: */
new_addr->flags &= ~IR3_INSTR_MARK;
}
@@ -625,7 +652,7 @@ split_addr(struct ir3_sched_ctx *ctx)
}
/* all remaining indirects remapped to new addr: */
ctx->addr = NULL;
*addr = NULL;
return new_addr;
}
@@ -682,7 +709,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
ctx->block = block;
/* addr/pred writes are per-block: */
ctx->addr = NULL;
ctx->addr0 = NULL;
ctx->addr1 = NULL;
ctx->pred = NULL;
/* move all instructions to the unscheduled list, and
@@ -740,14 +768,19 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
schedule(ctx, instr);
} else {
struct ir3_instruction *new_instr = NULL;
struct ir3 *ir = block->shader;
/* nothing available to schedule.. if we are blocked on
* address/predicate register conflict, then break the
* deadlock by cloning the instruction that wrote that
* reg:
*/
if (notes.addr_conflict) {
new_instr = split_addr(ctx);
if (notes.addr0_conflict) {
new_instr = split_addr(ctx, &ctx->addr0,
ir->a0_users, ir->a0_users_count);
} else if (notes.addr1_conflict) {
new_instr = split_addr(ctx, &ctx->addr1,
ir->a1_users, ir->a1_users_count);
} else if (notes.pred_conflict) {
new_instr = split_pred(ctx);
} else {