nir/opt_shrink_vectors: shrink memory loads, not just IO

The problem with radeonsi+ACO is that a UBO load of a vec4 uniform that uses
only 1 component still loads all 4 components. This fixes that.

We are only interested in shrinking UBO and SSBO loads, but I added more
intrinsics because why not.

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29384>
Author: Marek Olšák
Date: 2024-05-24 08:23:23 -04:00
Committed by: Marge Bot
Parent: ef2bb4728f
Commit: 09e64e3682
3 changed files with 27 additions and 4 deletions
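As the commit message notes, a vec4 UBO slot that is only partially read used to be loaded in full. Below is a minimal, hedged nir_builder sketch of that pattern. It is not taken from this commit; the helper name and the .align_mul/.range_base/.range index values are illustrative only.

#include "nir.h"
#include "nir_builder.h"

/* Build a 4-component UBO load of which only .x is consumed.  Before this
 * change, nir_opt_shrink_vectors left such a load as a vec4 load; with it,
 * the load can be narrowed to a single component, with the byte offset and
 * alignment adjusted as needed. */
static nir_def *
build_wasteful_ubo_load(nir_builder *b)
{
   nir_def *ubo_index = nir_imm_int(b, 0);   /* UBO binding 0 */
   nir_def *offset    = nir_imm_int(b, 0);   /* byte offset of the vec4 */

   /* Loads all 4 components... */
   nir_def *v = nir_load_ubo(b, 4, 32, ubo_index, offset,
                             .align_mul = 16, .range_base = 0, .range = 16);

   /* ...but only component 0 is ever read. */
   return nir_channel(b, v, 0);
}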


@@ -286,6 +286,7 @@ alpine-build-testing:
     C_ARGS: >
       -Wno-error=cpp
       -Wno-error=array-bounds
+      -Wno-error=stringop-overflow
       -Wno-error=stringop-overread
     DRI_LOADERS: >
       -D glx=disabled


@@ -2750,6 +2750,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_task_payload:
    case nir_intrinsic_load_uniform:
+   case nir_intrinsic_load_constant:
    case nir_intrinsic_load_push_constant:
    case nir_intrinsic_load_kernel_input:
    case nir_intrinsic_load_global:
@@ -2775,6 +2776,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
    case nir_intrinsic_load_per_vertex_output:
    case nir_intrinsic_load_per_primitive_output:
    case nir_intrinsic_load_interpolated_input:
+   case nir_intrinsic_load_smem_amd:
    case nir_intrinsic_store_output:
    case nir_intrinsic_store_shared:
    case nir_intrinsic_store_task_payload:
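The new cases above let nir_get_io_offset_src() report an offset source for these memory intrinsics, which nir_opt_shrink_vectors relies on below. A hedged sketch of that lookup; example_get_offset() is a hypothetical wrapper for illustration, not Mesa API.

#include "nir.h"

/* Return the offset source of a load/store intrinsic, or NULL if the
 * intrinsic has none.  This only wraps the core helpers whose switch
 * statements the hunks above extend. */
static nir_src *
example_get_offset(nir_intrinsic_instr *intr)
{
   if (nir_get_io_offset_src_number(intr) < 0)
      return NULL;   /* intrinsic has no offset source */
   return nir_get_io_offset_src(intr);   /* e.g. src 0 of load_constant */
}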


@@ -103,10 +103,14 @@ shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
       return false;
 
    nir_intrinsic_instr *intr = NULL;
-   if (def->parent_instr->type == nir_instr_type_intrinsic)
-      intr = nir_instr_as_intrinsic(def->parent_instr);
+   nir_src *offset_src = NULL;
 
-   shrink_start &= (intr != NULL) && nir_intrinsic_has_component(intr) &&
+   if (def->parent_instr->type == nir_instr_type_intrinsic) {
+      intr = nir_instr_as_intrinsic(def->parent_instr);
+      offset_src = nir_get_io_offset_src(intr);
+   }
+
+   shrink_start &= intr && (nir_intrinsic_has_component(intr) || offset_src) &&
                    is_only_used_by_alu(def);
 
    int last_bit = util_last_bit(mask);
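In other words, the front of a vector can now be dropped when the producing intrinsic carries either a component index (IO intrinsics) or a discoverable offset source (memory intrinsics). A hedged restatement of that guard as a standalone predicate; can_shrink_start() is hypothetical, and its use check is a simplified stand-in for the pass's is_only_used_by_alu() helper.

#include "nir.h"

static bool
can_shrink_start(nir_def *def, nir_intrinsic_instr *intr, nir_src *offset_src)
{
   if (!intr)
      return false;   /* def is not produced by an intrinsic */

   /* The shift must be representable on the intrinsic, either as a
    * component index or by bumping the byte-offset source. */
   if (!nir_intrinsic_has_component(intr) && !offset_src)
      return false;

   /* Every user must be an ALU instruction so its swizzle can be remapped. */
   nir_foreach_use (src, def) {
      if (nir_src_parent_instr(src)->type != nir_instr_type_alu)
         return false;
   }
   return true;
}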
@@ -122,9 +126,25 @@ shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
    if (first_bit) {
       assert(shrink_start);
-      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) + first_bit);
+      if (nir_intrinsic_has_component(intr)) {
+         unsigned new_component = nir_intrinsic_component(intr) + first_bit;
+         nir_intrinsic_set_component(intr, new_component);
+      } else {
+         /* Add the component offset into the src offset. */
+         unsigned offset = (def->bit_size / 8) * first_bit;
+
+         if (nir_intrinsic_has_align_offset(intr)) {
+            unsigned align_offset = (nir_intrinsic_align_offset(intr) + offset) %
+                                    nir_intrinsic_align_mul(intr);
+            nir_intrinsic_set_align_offset(intr, align_offset);
+         }
+
+         nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
+         nir_src_rewrite(offset_src, nir_iadd_imm(&b, offset_src->ssa, offset));
+      }
 
       /* Reswizzle sources, which must be ALU since they have swizzle */
       assert(first_bit + comps <= NIR_MAX_VEC_COMPONENTS);
       uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
       for (unsigned i = 0; i < comps; ++i) {
          swizzle[first_bit + i] = i;
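To make the byte-offset arithmetic in the new else branch concrete, here is a small standalone calculation with illustrative values, not taken from the commit: a 32-bit vec4 load of which only component 3 (.w) is read, originally with align_mul = 16 and align_offset = 0.

#include <stdio.h>

int main(void)
{
   unsigned bit_size  = 32;
   unsigned first_bit = 3;    /* lowest component still read (.w) */
   unsigned align_mul = 16;
   unsigned align_off = 0;

   unsigned offset = (bit_size / 8) * first_bit;   /* 4 bytes * 3 = 12 */
   align_off = (align_off + offset) % align_mul;   /* (0 + 12) % 16 = 12 */

   /* The pass adds 12 bytes to the offset source and the load shrinks from
    * vec4 to a single component. */
   printf("byte offset += %u, new align_offset = %u\n", offset, align_off);
   return 0;
}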