diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 1ccb98338e4..6f00b2cbea8 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -175,6 +175,8 @@ struct fd_dev_info {
       /* See ir3_compiler::has_scalar_alu. */
       bool has_scalar_alu;
 
+      bool has_isam_v;
+
       /* Whether writing to UBWC attachment and reading the same image as input
        * attachment or as a texture reads correct values from the image.
        * If this is false, we may read stale values from the flag buffer,
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index ae177e8a70d..1cfe020447c 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -414,6 +414,7 @@ a6xx_gen4 = A6XXProps(
     has_lrz_dir_tracking = True,
     has_per_view_viewport = True,
     has_scalar_alu = True,
+    has_isam_v = True,
 )
 
 a6xx_a690_quirk = A6XXProps(
@@ -794,6 +795,7 @@ a7xx_base = A6XXProps(
     line_width_max = 127.5,
     has_scalar_alu = True,
     has_coherent_ubwc_flag_caches = True,
+    has_isam_v = True,
 )
 
 a7xx_725 = A7XXProps(
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index ab3077594ad..dd38e2af885 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -1258,6 +1258,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
          return false;
       break;
    case 5:
+      if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
+         if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
+             (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
+            return flags == IR3_REG_IMMED;
+         }
+      }
       /* no flags allowed */
       if (flags)
          return false;
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 2f36294c1b8..177796738fe 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -224,6 +224,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_branch_and_or = true;
       compiler->has_predication = true;
       compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
+      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
       compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
    } else {
       compiler->max_const_pipeline = 512;
@@ -237,6 +238,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
 
       compiler->max_const_safe = 256;
       compiler->has_scalar_alu = false;
+      compiler->has_isam_v = false;
    }
 
    /* This is just a guess for a4xx. */
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index ac8885656d1..88896bc76ce 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -208,6 +208,9 @@ struct ir3_compiler {
    /* Whether SSBOs have descriptors for sampling with ISAM */
    bool has_isam_ssbo;
 
+   /* Whether isam.v is supported, for sampling multiple components from SSBOs */
+   bool has_isam_v;
+
    /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
    bool storage_16bit;
 
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 8fed9d9116b..75c0716d0c3 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1592,9 +1592,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
                          nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
 {
-   /* Note: isam currently can't handle vectorized loads/stores */
+   /* Note: we can only use isam for vectorized loads/stores if isam.v is
+    * available.
+    */
    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
-       intr->def.num_components > 1 ||
+       (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
       !ctx->compiler->has_isam_ssbo) {
       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
       return;
@@ -1602,13 +1604,27 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
   }
 
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
-  struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
+  struct ir3_instruction *coords = NULL;
+  unsigned imm_offset = 0;
+
+  if (ctx->compiler->has_isam_v) {
+     coords = offset;
+  } else {
+     coords = ir3_collect(b, offset, create_immed(b, 0));
+  }
   struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
 
   unsigned num_components = intr->def.num_components;
+
+  assert(num_components == 1 || ctx->compiler->has_isam_v);
+
   struct ir3_instruction *sam =
      emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
-              MASK(num_components), coords, NULL);
+              MASK(num_components), coords, create_immed(b, imm_offset));
+
+  if (ctx->compiler->has_isam_v) {
+     sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
+  }
 
   ir3_handle_nonuniform(sam, intr);
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
index 33e9bd1139b..1521f8a94c8 100644
--- a/src/freedreno/ir3/ir3_cp.c
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -444,6 +444,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
              (opc_cat(instr->opc) == 2) || (opc_cat(instr->opc) == 6) ||
              is_meta(instr) ||
+             (instr->opc == OPC_ISAM && (n == 1 || n == 2)) ||
              (is_mad(instr->opc) && (n == 0)));
 
      if ((opc_cat(instr->opc) == 2) &&
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 9bd9bc927b0..f5085e21489 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -92,10 +92,11 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
 
    /* Scalarize load_ssbo's that we could otherwise lower to isam,
    * as the tex cache benefit outweighs the benefit of vectorizing
+   * Don't do this if (vectorized) isam.v is supported.
    */
   if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
       (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
-      compiler->has_isam_ssbo) {
+      compiler->has_isam_ssbo && !compiler->has_isam_v) {
      return true;
   }
 
@@ -112,11 +113,12 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
   unsigned byte_size = bit_size / 8;
 
   /* Don't vectorize load_ssbo's that we could otherwise lower to isam,
-   * as the tex cache benefit outweighs the benefit of vectorizing
+   * as the tex cache benefit outweighs the benefit of vectorizing. If
+   * isam.v is supported, though, we can vectorize and still use isam.
    */
   if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
       (nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
-      compiler->has_isam_ssbo) {
+      compiler->has_isam_ssbo && !compiler->has_isam_v) {
      return false;
   }
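
Note on the overall policy these hunks implement (an illustrative sketch, not part of the patch): a CAN_REORDER load_ssbo goes through the texture cache via isam whenever the SSBO descriptors allow it, and with isam.v the load can also stay vectorized; without isam.v, ir3_nir_should_scalarize_mem() splits wide loads so that each scalar component can still use isam. The standalone helper below restates that gating condition; the helper name and struct are hypothetical, while the field names mirror ir3_compiler:

   #include <stdbool.h>

   /* Hypothetical stand-in for the two ir3_compiler capability bits the
    * hunks above consult. */
   struct isam_caps {
      bool has_isam_ssbo; /* SSBOs have descriptors usable with isam */
      bool has_isam_v;    /* vectorized isam.v encoding is available */
   };

   /* Mirrors the bail-out test at the top of emit_intrinsic_load_ssbo():
    * true if a load_ssbo with these properties can be lowered to isam
    * (or isam.v), false if it must go through the plain SSBO load path. */
   static bool
   can_lower_load_ssbo_to_isam(const struct isam_caps *caps,
                               bool can_reorder, unsigned num_components)
   {
      if (!can_reorder || !caps->has_isam_ssbo)
         return false;
      /* Without isam.v, a single isam fetches one component, so only
       * scalar loads qualify; the NIR scalarization pass above keeps
       * this case reachable by splitting wider loads. */
      return num_components == 1 || caps->has_isam_v;
   }

This also explains the two ir3_nir.c heuristic changes: once isam.v can fetch every component in one texture-cache access, scalarizing such loads (and then refusing to re-vectorize them) would only add instructions, so both heuristics are gated on !compiler->has_isam_v.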