ir3: Fix LDC offset units
I had missed that LDC actually uses vec4 units for its offset. This means that we have to create a new instruction and lower it in ir3_nir_lower_io_offsets, similar to the existing SSBO instructions. Unfortunately, we can't assume that loads are always vec4-aligned, so we have to use the alignment information that NIR gives us. That alignment information is currently woefully inadequate, and will have to be fixed to give us good codegen in the future. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4568>
This commit is contained in:
@@ -802,6 +802,12 @@ intrinsic("ssbo_atomic_xor_ir3", src_comp=[1, 1, 1, 1], dest_comp=1)
|
|||||||
intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1)
|
intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1)
|
||||||
intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1)
|
intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1)
|
||||||
|
|
||||||
|
# IR3-specific instruction for UBO loads using the ldc instruction. The second
|
||||||
|
# source is the indirect offset, in units of four dwords. The base is a
|
||||||
|
# component offset, in dword units.
|
||||||
|
intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE],
|
||||||
|
flags=[CAN_REORDER, CAN_ELIMINATE])
|
||||||
|
|
||||||
# System values for freedreno geometry shaders.
|
# System values for freedreno geometry shaders.
|
||||||
system_value("vs_primitive_stride_ir3", 1)
|
system_value("vs_primitive_stride_ir3", 1)
|
||||||
system_value("vs_vertex_stride_ir3", 1)
|
system_value("vs_vertex_stride_ir3", 1)
|
||||||
|
@@ -940,6 +940,8 @@ static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr)
|
|||||||
fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
|
fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
|
||||||
fprintf(ctx->out, ".%dd", cat6->d + 1);
|
fprintf(ctx->out, ".%dd", cat6->d + 1);
|
||||||
fprintf(ctx->out, ".%s", type[cat6->type]);
|
fprintf(ctx->out, ".%s", type[cat6->type]);
|
||||||
|
} else {
|
||||||
|
fprintf(ctx->out, ".offset%d", cat6->d);
|
||||||
}
|
}
|
||||||
fprintf(ctx->out, ".%u", cat6->type_size + 1);
|
fprintf(ctx->out, ".%u", cat6->type_size + 1);
|
||||||
|
|
||||||
|
@@ -561,7 +561,7 @@ static int emit_cat6_a6xx(struct ir3_instruction *instr, void *ptr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
cat6->type = instr->cat6.type;
|
cat6->type = instr->cat6.type;
|
||||||
cat6->d = instr->cat6.d - 1;
|
cat6->d = instr->cat6.d - (instr->opc == OPC_LDC ? 0 : 1);
|
||||||
cat6->typed = instr->cat6.typed;
|
cat6->typed = instr->cat6.typed;
|
||||||
cat6->type_size = instr->cat6.iim_val - 1;
|
cat6->type_size = instr->cat6.iim_val - 1;
|
||||||
cat6->opc = instr->opc;
|
cat6->opc = instr->opc;
|
||||||
|
@@ -267,7 +267,7 @@ struct ir3_instruction {
|
|||||||
int src_offset;
|
int src_offset;
|
||||||
int dst_offset;
|
int dst_offset;
|
||||||
int iim_val : 3; /* for ldgb/stgb, # of components */
|
int iim_val : 3; /* for ldgb/stgb, # of components */
|
||||||
unsigned d : 3;
|
unsigned d : 3; /* for ldc, component offset */
|
||||||
bool typed : 1;
|
bool typed : 1;
|
||||||
unsigned base : 3;
|
unsigned base : 3;
|
||||||
} cat6;
|
} cat6;
|
||||||
|
@@ -748,8 +748,8 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
|||||||
struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
|
struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
|
||||||
struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
|
struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
|
||||||
ldc->regs[0]->wrmask = MASK(ncomp);
|
ldc->regs[0]->wrmask = MASK(ncomp);
|
||||||
ldc->cat6.iim_val = intr->num_components;
|
ldc->cat6.iim_val = ncomp;
|
||||||
ldc->cat6.d = 1;
|
ldc->cat6.d = nir_intrinsic_base(intr);
|
||||||
ldc->cat6.type = TYPE_U32;
|
ldc->cat6.type = TYPE_U32;
|
||||||
|
|
||||||
nir_intrinsic_instr *bindless = ir3_bindless_resource(intr->src[0]);
|
nir_intrinsic_instr *bindless = ir3_bindless_resource(intr->src[0]);
|
||||||
@@ -768,13 +768,6 @@ static void
|
|||||||
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||||
struct ir3_instruction **dst)
|
struct ir3_instruction **dst)
|
||||||
{
|
{
|
||||||
if (ir3_bindless_resource(intr->src[0])) {
|
|
||||||
/* TODO: We should be using ldc for non-bindless things on a6xx as
|
|
||||||
* well.
|
|
||||||
*/
|
|
||||||
emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
struct ir3_block *b = ctx->block;
|
struct ir3_block *b = ctx->block;
|
||||||
struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
|
struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
|
||||||
/* UBO addresses are the first driver params, but subtract 2 here to
|
/* UBO addresses are the first driver params, but subtract 2 here to
|
||||||
@@ -1612,6 +1605,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||||||
case nir_intrinsic_load_ubo:
|
case nir_intrinsic_load_ubo:
|
||||||
emit_intrinsic_load_ubo(ctx, intr, dst);
|
emit_intrinsic_load_ubo(ctx, intr, dst);
|
||||||
break;
|
break;
|
||||||
|
case nir_intrinsic_load_ubo_ir3:
|
||||||
|
emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
|
||||||
|
break;
|
||||||
case nir_intrinsic_load_frag_coord:
|
case nir_intrinsic_load_frag_coord:
|
||||||
ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4);
|
ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4);
|
||||||
break;
|
break;
|
||||||
|
@@ -250,6 +250,84 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b)
|
||||||
|
{
|
||||||
|
/* We only need to lower offset if using LDC. Currently, we only use LDC
|
||||||
|
* in the bindless mode. Also, LDC is introduced on A6xx, but currently we
|
||||||
|
* only use bindless in turnip which is A6xx only.
|
||||||
|
*
|
||||||
|
* TODO: We should be using LDC always on A6xx+.
|
||||||
|
*/
|
||||||
|
if (!ir3_bindless_resource(intrinsic->src[0]))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* TODO handle other bitsizes, including non-dword-aligned loads */
|
||||||
|
assert(intrinsic->dest.ssa.bit_size == 32);
|
||||||
|
|
||||||
|
b->cursor = nir_before_instr(&intrinsic->instr);
|
||||||
|
|
||||||
|
nir_intrinsic_instr *new_intrinsic =
|
||||||
|
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3);
|
||||||
|
|
||||||
|
debug_assert(intrinsic->dest.is_ssa);
|
||||||
|
new_intrinsic->src[0] = nir_src_for_ssa(intrinsic->src[0].ssa);
|
||||||
|
|
||||||
|
nir_ssa_def *offset = intrinsic->src[1].ssa;
|
||||||
|
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4);
|
||||||
|
|
||||||
|
if (!new_offset)
|
||||||
|
new_offset = nir_ushr(b, offset, nir_imm_int(b, 4));
|
||||||
|
|
||||||
|
new_intrinsic->src[1] = nir_src_for_ssa(new_offset);
|
||||||
|
|
||||||
|
unsigned align_mul = nir_intrinsic_align_mul(intrinsic);
|
||||||
|
unsigned align_offset = nir_intrinsic_align_offset(intrinsic);
|
||||||
|
|
||||||
|
unsigned components = intrinsic->num_components;
|
||||||
|
|
||||||
|
if (align_mul % 16 != 0)
|
||||||
|
components = 4;
|
||||||
|
|
||||||
|
new_intrinsic->num_components = components;
|
||||||
|
|
||||||
|
nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
|
||||||
|
components, 32, NULL);
|
||||||
|
|
||||||
|
nir_builder_instr_insert(b, &new_intrinsic->instr);
|
||||||
|
|
||||||
|
nir_ssa_def *new_dest;
|
||||||
|
if (align_mul % 16 == 0) {
|
||||||
|
/* We know that the low 4 bits of the offset are constant and equal to
|
||||||
|
* align_offset. Use the component offset.
|
||||||
|
*/
|
||||||
|
unsigned component = align_offset / 4;
|
||||||
|
nir_intrinsic_set_base(new_intrinsic, component);
|
||||||
|
new_dest = &new_intrinsic->dest.ssa;
|
||||||
|
} else {
|
||||||
|
/* We have to assume it isn't aligned, and extract the components
|
||||||
|
* dynamically.
|
||||||
|
*/
|
||||||
|
nir_intrinsic_set_base(new_intrinsic, 0);
|
||||||
|
nir_ssa_def *component =
|
||||||
|
nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3));
|
||||||
|
nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
|
||||||
|
for (unsigned i = 0; i < intrinsic->num_components; i++) {
|
||||||
|
nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component);
|
||||||
|
channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
new_dest = nir_vec(b, channels, intrinsic->num_components);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
|
||||||
|
nir_src_for_ssa(new_dest));
|
||||||
|
|
||||||
|
nir_instr_remove(&intrinsic->instr);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
|
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
|
||||||
{
|
{
|
||||||
@@ -261,6 +339,12 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
|
|||||||
|
|
||||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||||
|
|
||||||
|
/* UBO */
|
||||||
|
if (intr->intrinsic == nir_intrinsic_load_ubo) {
|
||||||
|
progress |= lower_offset_for_ubo(intr, b);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* SSBO */
|
/* SSBO */
|
||||||
int ir3_intrinsic;
|
int ir3_intrinsic;
|
||||||
uint8_t offset_src_idx;
|
uint8_t offset_src_idx;
|
||||||
|
@@ -127,6 +127,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags)
|
|||||||
printf(".s");
|
printf(".s");
|
||||||
if (instr->flags & IR3_INSTR_A1EN)
|
if (instr->flags & IR3_INSTR_A1EN)
|
||||||
printf(".a1en");
|
printf(".a1en");
|
||||||
|
if (instr->opc == OPC_LDC)
|
||||||
|
printf(".offset%d", instr->cat6.d);
|
||||||
if (instr->flags & IR3_INSTR_B) {
|
if (instr->flags & IR3_INSTR_B) {
|
||||||
printf(".base%d",
|
printf(".base%d",
|
||||||
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
|
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
|
||||||
|
Reference in New Issue
Block a user