ir3: use isam.v for multi-component SSBO loads
Since a7xx, isam.v can be used to perform multi-component SSBO loads. Use this whenever possible to prevent excessive scalarization. isam.v also takes only a single coordinate (as opposed to the 2-dimensional coordinate that isam requires), so this reduces register pressure as well.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28664>
This commit is contained in:
@@ -175,6 +175,8 @@ struct fd_dev_info {
|
|||||||
/* See ir3_compiler::has_scalar_alu. */
|
/* See ir3_compiler::has_scalar_alu. */
|
||||||
bool has_scalar_alu;
|
bool has_scalar_alu;
|
||||||
|
|
||||||
|
bool has_isam_v;
|
||||||
|
|
||||||
/* Whether writing to UBWC attachment and reading the same image as input
|
/* Whether writing to UBWC attachment and reading the same image as input
|
||||||
* attachment or as a texture reads correct values from the image.
|
* attachment or as a texture reads correct values from the image.
|
||||||
* If this is false, we may read stale values from the flag buffer,
|
* If this is false, we may read stale values from the flag buffer,
|
||||||
|
@@ -414,6 +414,7 @@ a6xx_gen4 = A6XXProps(
|
|||||||
has_lrz_dir_tracking = True,
|
has_lrz_dir_tracking = True,
|
||||||
has_per_view_viewport = True,
|
has_per_view_viewport = True,
|
||||||
has_scalar_alu = True,
|
has_scalar_alu = True,
|
||||||
|
has_isam_v = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
a6xx_a690_quirk = A6XXProps(
|
a6xx_a690_quirk = A6XXProps(
|
||||||
@@ -794,6 +795,7 @@ a7xx_base = A6XXProps(
|
|||||||
line_width_max = 127.5,
|
line_width_max = 127.5,
|
||||||
has_scalar_alu = True,
|
has_scalar_alu = True,
|
||||||
has_coherent_ubwc_flag_caches = True,
|
has_coherent_ubwc_flag_caches = True,
|
||||||
|
has_isam_v = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
a7xx_725 = A7XXProps(
|
a7xx_725 = A7XXProps(
|
||||||
|
@@ -1258,6 +1258,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
|
|||||||
return false;
|
return false;
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
|
if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
|
||||||
|
if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
|
||||||
|
(!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
|
||||||
|
return flags == IR3_REG_IMMED;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* no flags allowed */
|
/* no flags allowed */
|
||||||
if (flags)
|
if (flags)
|
||||||
return false;
|
return false;
|
||||||
|
@@ -224,6 +224,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||||||
compiler->has_branch_and_or = true;
|
compiler->has_branch_and_or = true;
|
||||||
compiler->has_predication = true;
|
compiler->has_predication = true;
|
||||||
compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
|
compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
|
||||||
|
compiler->has_isam_v = dev_info->a6xx.has_isam_v;
|
||||||
compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
|
compiler->fs_must_have_non_zero_constlen_quirk = dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
|
||||||
} else {
|
} else {
|
||||||
compiler->max_const_pipeline = 512;
|
compiler->max_const_pipeline = 512;
|
||||||
@@ -237,6 +238,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||||||
compiler->max_const_safe = 256;
|
compiler->max_const_safe = 256;
|
||||||
|
|
||||||
compiler->has_scalar_alu = false;
|
compiler->has_scalar_alu = false;
|
||||||
|
compiler->has_isam_v = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is just a guess for a4xx. */
|
/* This is just a guess for a4xx. */
|
||||||
|
@@ -208,6 +208,9 @@ struct ir3_compiler {
|
|||||||
/* Whether SSBOs have descriptors for sampling with ISAM */
|
/* Whether SSBOs have descriptors for sampling with ISAM */
|
||||||
bool has_isam_ssbo;
|
bool has_isam_ssbo;
|
||||||
|
|
||||||
|
/* Whether isam.v is supported to sample multiple components from SSBOs */
|
||||||
|
bool has_isam_v;
|
||||||
|
|
||||||
/* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
|
/* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
|
||||||
bool storage_16bit;
|
bool storage_16bit;
|
||||||
|
|
||||||
|
@@ -1592,9 +1592,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
|
|||||||
nir_intrinsic_instr *intr,
|
nir_intrinsic_instr *intr,
|
||||||
struct ir3_instruction **dst)
|
struct ir3_instruction **dst)
|
||||||
{
|
{
|
||||||
/* Note: isam currently can't handle vectorized loads/stores */
|
/* Note: we can only use isam for vectorized loads/stores if isam.v is
|
||||||
|
* available.
|
||||||
|
*/
|
||||||
if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
|
if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
|
||||||
intr->def.num_components > 1 ||
|
(intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
|
||||||
!ctx->compiler->has_isam_ssbo) {
|
!ctx->compiler->has_isam_ssbo) {
|
||||||
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
|
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
|
||||||
return;
|
return;
|
||||||
@@ -1602,13 +1604,27 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
|
|||||||
|
|
||||||
struct ir3_block *b = ctx->block;
|
struct ir3_block *b = ctx->block;
|
||||||
struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
|
struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||||
struct ir3_instruction *coords = ir3_collect(b, offset, create_immed(b, 0));
|
struct ir3_instruction *coords = NULL;
|
||||||
|
unsigned imm_offset = 0;
|
||||||
|
|
||||||
|
if (ctx->compiler->has_isam_v) {
|
||||||
|
coords = offset;
|
||||||
|
} else {
|
||||||
|
coords = ir3_collect(b, offset, create_immed(b, 0));
|
||||||
|
}
|
||||||
|
|
||||||
struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
|
struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
|
||||||
|
|
||||||
unsigned num_components = intr->def.num_components;
|
unsigned num_components = intr->def.num_components;
|
||||||
|
assert(num_components == 1 || ctx->compiler->has_isam_v);
|
||||||
|
|
||||||
struct ir3_instruction *sam =
|
struct ir3_instruction *sam =
|
||||||
emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
|
emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
|
||||||
MASK(num_components), coords, NULL);
|
MASK(num_components), coords, create_immed(b, imm_offset));
|
||||||
|
|
||||||
|
if (ctx->compiler->has_isam_v) {
|
||||||
|
sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
|
||||||
|
}
|
||||||
|
|
||||||
ir3_handle_nonuniform(sam, intr);
|
ir3_handle_nonuniform(sam, intr);
|
||||||
|
|
||||||
|
@@ -444,6 +444,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
|
|||||||
(opc_cat(instr->opc) == 2) ||
|
(opc_cat(instr->opc) == 2) ||
|
||||||
(opc_cat(instr->opc) == 6) ||
|
(opc_cat(instr->opc) == 6) ||
|
||||||
is_meta(instr) ||
|
is_meta(instr) ||
|
||||||
|
(instr->opc == OPC_ISAM && (n == 1 || n == 2)) ||
|
||||||
(is_mad(instr->opc) && (n == 0)));
|
(is_mad(instr->opc) && (n == 0)));
|
||||||
|
|
||||||
if ((opc_cat(instr->opc) == 2) &&
|
if ((opc_cat(instr->opc) == 2) &&
|
||||||
|
@@ -92,10 +92,11 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
|
|||||||
|
|
||||||
/* Scalarize load_ssbo's that we could otherwise lower to isam,
|
/* Scalarize load_ssbo's that we could otherwise lower to isam,
|
||||||
* as the tex cache benefit outweighs the benefit of vectorizing
|
* as the tex cache benefit outweighs the benefit of vectorizing
|
||||||
|
* Don't do this if (vectorized) isam.v is supported.
|
||||||
*/
|
*/
|
||||||
if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
|
if ((intrin->intrinsic == nir_intrinsic_load_ssbo) &&
|
||||||
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
|
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
|
||||||
compiler->has_isam_ssbo) {
|
compiler->has_isam_ssbo && !compiler->has_isam_v) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -112,11 +113,12 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
|
|||||||
unsigned byte_size = bit_size / 8;
|
unsigned byte_size = bit_size / 8;
|
||||||
|
|
||||||
/* Don't vectorize load_ssbo's that we could otherwise lower to isam,
|
/* Don't vectorize load_ssbo's that we could otherwise lower to isam,
|
||||||
* as the tex cache benefit outweighs the benefit of vectorizing
|
* as the tex cache benefit outweighs the benefit of vectorizing. If we
|
||||||
|
* support isam.v, we can vectorize this though.
|
||||||
*/
|
*/
|
||||||
if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
|
if ((low->intrinsic == nir_intrinsic_load_ssbo) &&
|
||||||
(nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
|
(nir_intrinsic_access(low) & ACCESS_CAN_REORDER) &&
|
||||||
compiler->has_isam_ssbo) {
|
compiler->has_isam_ssbo && !compiler->has_isam_v) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user