ir3: use isam.v for multi-component SSBO loads

Since a7xx, isam.v can be used to perform multi-component SSBO loads.
Use this whenever possible to prevent excessive scalarization. isam.v
also uses only a single coordinate (as opposed to a 2-dimensional
coordinate for isam) so this reduces register pressure as well.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28664>
This commit is contained in:
Job Noorman
2024-04-10 10:43:38 +02:00
committed by Marge Bot
parent 455ebcccfb
commit c4fe247e62
8 changed files with 41 additions and 7 deletions

View File

@@ -175,6 +175,8 @@ struct fd_dev_info {
/* See ir3_compiler::has_scalar_alu. */
bool has_scalar_alu;
bool has_isam_v;
/* Whether writing to UBWC attachment and reading the same image as input
* attachment or as a texture reads correct values from the image.
* If this is false, we may read stale values from the flag buffer,

View File

@@ -414,6 +414,7 @@ a6xx_gen4 = A6XXProps(
has_lrz_dir_tracking = True,
has_per_view_viewport = True,
has_scalar_alu = True,
has_isam_v = True,
)
a6xx_a690_quirk = A6XXProps(
@@ -794,6 +795,7 @@ a7xx_base = A6XXProps(
line_width_max = 127.5,
has_scalar_alu = True,
has_coherent_ubwc_flag_caches = True,
has_isam_v = True,
)
a7xx_725 = A7XXProps(

View File

@@ -1258,6 +1258,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
return false;
break;
case 5:
if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
(!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
return flags == IR3_REG_IMMED;
}
}
/* no flags allowed */
if (flags)
return false;

View File

@@ -224,6 +224,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->has_branch_and_or = true;
compiler->has_predication = true;
compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
compiler->has_isam_v = dev_info->a6xx.has_isam_v;
compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
} else {
compiler->max_const_pipeline = 512;
@@ -237,6 +238,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->max_const_safe = 256;
compiler->has_scalar_alu = false;
compiler->has_isam_v = false;
}
/* This is just a guess for a4xx. */

View File

@@ -208,6 +208,9 @@ struct ir3_compiler {
/* Whether SSBOs have descriptors for sampling with ISAM */
bool has_isam_ssbo;
/* Whether isam.v is supported to sample multiple components from SSBOs */
bool has_isam_v;
/* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
bool storage_16bit;

View File

@@ -1592,9 +1592,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst)
{
/* Note: isam currently can't handle vectorized loads/stores */
/* Note: we can only use isam for vectorized loads/stores if isam.v is
* available.
*/
if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
intr->def.num_components > 1 ||
(intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
!ctx->compiler->has_isam_ssbo) {
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
return;
@@ -1602,13 +1604,27 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
struct ir3_block *b = ctx->block;
struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
struct ir3_instruction *coords = NULL;
unsigned imm_offset = 0;
if (ctx->compiler->has_isam_v) {
coords = offset;
} else {
coords = ir3_collect(b, offset, create_immed(b, 0));
}
struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
unsigned num_components = intr->def.num_components;
assert(num_components == 1 || ctx->compiler->has_isam_v);
struct ir3_instruction *sam =
emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
MASK(num_components), coords, NULL);
MASK(num_components), coords, create_immed(b, imm_offset));
if (ctx->compiler->has_isam_v) {
sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
}
ir3_handle_nonuniform(sam, intr);

View File

@@ -444,6 +444,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
(opc_cat(instr->opc) == 2) ||
(opc_cat(instr->opc) == 6) ||
is_meta(instr) ||
(instr->opc == OPC_ISAM && (n == 1 || n == 2)) ||
(is_mad(instr->opc) && (n == 0)));
if ((opc_cat(instr->opc) == 2) &&

View File

@@ -92,10 +92,11 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
/* Scalarize load_ssbo's that we could otherwise lower to isam,
* as the tex cache benefit outweighs the benefit of vectorizing.
* Don't do this if (vectorized) isam.v is supported.
*/
if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
compiler->has_isam_ssbo) {
compiler->has_isam_ssbo && !compiler->has_isam_v) {
return true;
}
@@ -112,11 +113,12 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned byte_size = bit_size / 8;
/* Don't vectorize load_ssbo's that we could otherwise lower to isam,
* as the tex cache benefit outweighs the benefit of vectorizing
* as the tex cache benefit outweighs the benefit of vectorizing. If we
* support isam.v, we can vectorize this though.
*/
if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
(nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
compiler->has_isam_ssbo) {
compiler->has_isam_ssbo && !compiler->has_isam_v) {
return false;
}