ir3: use isam.v for multi-component SSBO loads

Since a7xx, isam.v can be used to perform multi-component SSBO loads.
Use this whenever possible to prevent excessive scalarization. isam.v
also uses only a single coordinate (as opposed to a 2-dimensional
coordinate for isam), so this reduces register pressure as well.
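
The register-pressure argument boils down to simple arithmetic. As an
illustration only (a standalone model, not ir3 compiler code), the sketch
below compares a vec4 SSBO load lowered to four scalar isam instructions,
each of which needs a two-component (offset, 0) coordinate collect, against
a single isam.v that takes just the one-component offset:

  #include <stdio.h>

  /* Illustrative model only, not compiler code: count the sam instructions
   * and coordinate registers an N-component SSBO load needs with and
   * without isam.v. */
  static void model_load(unsigned num_components, int has_isam_v)
  {
     unsigned instrs, coord_regs;

     if (has_isam_v) {
        /* One isam.v covers every component and takes a single 1-D offset. */
        instrs = 1;
        coord_regs = 1;
     } else {
        /* Each component becomes its own isam, and each isam wants a 2-D
         * coordinate, i.e. an (offset, 0) pair collected into two
         * consecutive registers. */
        instrs = num_components;
        coord_regs = 2 * num_components;
     }

     printf("vec%u load, isam.v=%d: %u sam instruction(s), %u coord reg(s)\n",
            num_components, has_isam_v, instrs, coord_regs);
  }

  int main(void)
  {
     model_load(4, 0); /* pre-isam.v path: scalarized isam  */
     model_load(4, 1); /* isam.v path: one vectorized fetch */
     return 0;
  }

For a vec4 load this prints 4 instructions and 8 coordinate registers on the
scalarized path versus 1 and 1 with isam.v.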

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28664>
Author:    Job Noorman
Date:      2024-04-10 10:43:38 +02:00
Committer: Marge Bot
Parent:    455ebcccfb
Commit:    c4fe247e62

8 changed files with 41 additions and 7 deletions


@@ -175,6 +175,8 @@ struct fd_dev_info {
       /* See ir3_compiler::has_scalar_alu. */
       bool has_scalar_alu;
 
+      bool has_isam_v;
+
       /* Whether writing to UBWC attachment and reading the same image as input
        * attachment or as a texture reads correct values from the image.
        * If this is false, we may read stale values from the flag buffer,


@@ -414,6 +414,7 @@ a6xx_gen4 = A6XXProps(
         has_lrz_dir_tracking = True,
         has_per_view_viewport = True,
         has_scalar_alu = True,
+        has_isam_v = True,
     )
 
 a6xx_a690_quirk = A6XXProps(
@@ -794,6 +795,7 @@ a7xx_base = A6XXProps(
         line_width_max = 127.5,
         has_scalar_alu = True,
         has_coherent_ubwc_flag_caches = True,
+        has_isam_v = True,
     )
 
 a7xx_725 = A7XXProps(


@@ -1258,6 +1258,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
          return false;
       break;
    case 5:
+      if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
+         if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
+             (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
+            return flags == IR3_REG_IMMED;
+         }
+      }
       /* no flags allowed */
       if (flags)
          return false;


@@ -224,6 +224,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_branch_and_or = true;
       compiler->has_predication = true;
       compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
+      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
       compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
    } else {
       compiler->max_const_pipeline = 512;
@@ -237,6 +238,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->max_const_safe = 256;
 
       compiler->has_scalar_alu = false;
+      compiler->has_isam_v = false;
    }
 
    /* This is just a guess for a4xx. */


@@ -208,6 +208,9 @@ struct ir3_compiler {
    /* Whether SSBOs have descriptors for sampling with ISAM */
    bool has_isam_ssbo;
 
+   /* Whether isam.v is supported to sample multiple components from SSBOs */
+   bool has_isam_v;
+
    /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
    bool storage_16bit;


@@ -1592,9 +1592,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
                          nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
 {
-   /* Note: isam currently can't handle vectorized loads/stores */
+   /* Note: we can only use isam for vectorized loads/stores if isam.v is
+    * available.
+    */
    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
-       intr->def.num_components > 1 ||
+       (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
        !ctx->compiler->has_isam_ssbo) {
       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
       return;
@@ -1602,13 +1604,27 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
    struct ir3_block *b = ctx->block;
    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
-   struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
+   struct ir3_instruction *coords = NULL;
+   unsigned imm_offset = 0;
+
+   if (ctx->compiler->has_isam_v) {
+      coords = offset;
+   } else {
+      coords = ir3_collect(b, offset, create_immed(b, 0));
+   }
+
    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
 
    unsigned num_components = intr->def.num_components;
+   assert(num_components == 1 || ctx->compiler->has_isam_v);
+
    struct ir3_instruction *sam =
       emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
-               MASK(num_components), coords, NULL);
+               MASK(num_components), coords, create_immed(b, imm_offset));
+
+   if (ctx->compiler->has_isam_v) {
+      sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
+   }
 
    ir3_handle_nonuniform(sam, intr);


@@ -444,6 +444,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
              (opc_cat(instr->opc) == 2) ||
              (opc_cat(instr->opc) == 6) ||
              is_meta(instr) ||
+             (instr->opc == OPC_ISAM && (n == 1 || n == 2)) ||
              (is_mad(instr->opc) && (n == 0)));
 
       if ((opc_cat(instr->opc) == 2) &&


@@ -92,10 +92,11 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
    /* Scalarize load_ssbo's that we could otherwise lower to isam,
     * as the tex cache benefit outweighs the benefit of vectorizing
+    * Don't do this if (vectorized) isam.v is supported.
     */
    if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
-       compiler->has_isam_ssbo) {
+       compiler->has_isam_ssbo && !compiler->has_isam_v) {
       return true;
    }
@@ -112,11 +113,12 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    unsigned byte_size = bit_size / 8;
 
    /* Don't vectorize load_ssbo's that we could otherwise lower to isam,
-    * as the tex cache benefit outweighs the benefit of vectorizing
+    * as the tex cache benefit outweighs the benefit of vectorizing. If we
+    * support isam.v, we can vectorize this though.
     */
    if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
        (nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
-       compiler->has_isam_ssbo) {
+       compiler->has_isam_ssbo && !compiler->has_isam_v) {
       return false;
    }