nir/opt_shrink_vectors: shrink memory loads, not just IO
With radeonsi+ACO, a UBO load from a vec4 uniform that uses only one component still loads all four components. This fixes that. We are only interested in shrinking UBO and SSBO loads, but more offset-based intrinsics are handled as well since the same logic applies.

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29384>
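As a rough illustration of how a driver might invoke the pass once this lands: the NIR_PASS call site below is hypothetical, and the shrink_start parameter is an assumption inferred from the shrink_dest_to_read_mask(nir_def *, bool shrink_start) signature visible in the diff, not something this commit adds.

#include "nir.h"

/* Hypothetical call site: run the shrink pass on a shader. Passing true
 * for the assumed shrink_start parameter lets reads that skip leading
 * components (e.g. only .zw of a vec4 UBO load) be shrunk as well. */
static bool
shrink_shader_vectors(nir_shader *shader)
{
   bool progress = false;
   NIR_PASS(progress, shader, nir_opt_shrink_vectors, true);
   return progress;
}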
@@ -286,6 +286,7 @@ alpine-build-testing:
     C_ARGS: >
       -Wno-error=cpp
       -Wno-error=array-bounds
       -Wno-error=stringop-overflow
+      -Wno-error=stringop-overread
     DRI_LOADERS: >
       -D glx=disabled
@@ -2750,6 +2750,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_task_payload:
    case nir_intrinsic_load_uniform:
+   case nir_intrinsic_load_constant:
    case nir_intrinsic_load_push_constant:
    case nir_intrinsic_load_kernel_input:
    case nir_intrinsic_load_global:
@@ -2775,6 +2776,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
    case nir_intrinsic_load_per_vertex_output:
    case nir_intrinsic_load_per_primitive_output:
    case nir_intrinsic_load_interpolated_input:
+   case nir_intrinsic_load_smem_amd:
    case nir_intrinsic_store_output:
    case nir_intrinsic_store_shared:
    case nir_intrinsic_store_task_payload:
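For context on why these cases matter: the pass below calls nir_get_io_offset_src, which plausibly wraps the function patched above along these lines (a sketch of the relationship, not the verbatim Mesa source):

/* Sketch: the src-number lookup returns a negative index for
 * intrinsics without an offset source, which the pointer-returning
 * wrapper turns into NULL. */
nir_src *
nir_get_io_offset_src(nir_intrinsic_instr *instr)
{
   const int idx = nir_get_io_offset_src_number(instr);
   return idx >= 0 ? &instr->src[idx] : NULL;
}

Adding a case here is therefore what lets the shrink pass treat these intrinsics as loads with an adjustable offset.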
@@ -103,10 +103,14 @@ shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
       return false;

    nir_intrinsic_instr *intr = NULL;
-   if (def->parent_instr->type == nir_instr_type_intrinsic)
-      intr = nir_instr_as_intrinsic(def->parent_instr);
+   nir_src *offset_src = NULL;
+
+   if (def->parent_instr->type == nir_instr_type_intrinsic) {
+      intr = nir_instr_as_intrinsic(def->parent_instr);
+      offset_src = nir_get_io_offset_src(intr);
+   }

-   shrink_start &= (intr != NULL) && nir_intrinsic_has_component(intr) &&
+   shrink_start &= intr && (nir_intrinsic_has_component(intr) || offset_src) &&
                    is_only_used_by_alu(def);

    int last_bit = util_last_bit(mask);
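Why the is_only_used_by_alu condition: dropping leading components renumbers the remaining ones, which is only recoverable by rewriting each reader's swizzle, and only ALU sources carry swizzles. A sketch of the assumed shape of that guard (the helper lives in this file but is not shown in the hunk):

static bool
is_only_used_by_alu(nir_def *def)
{
   /* Shrinking the front of a vector requires reswizzling every user;
    * only ALU instruction sources have swizzles to rewrite. */
   nir_foreach_use(src, def) {
      if (nir_src_parent_instr(src)->type != nir_instr_type_alu)
         return false;
   }
   return true;
}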
@@ -122,9 +126,25 @@ shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
    if (first_bit) {
       assert(shrink_start);
-      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) + first_bit);
+
+      if (nir_intrinsic_has_component(intr)) {
+         unsigned new_component = nir_intrinsic_component(intr) + first_bit;
+         nir_intrinsic_set_component(intr, new_component);
+      } else {
+         /* Add the component offset into the src offset. */
+         unsigned offset = (def->bit_size / 8) * first_bit;
+
+         if (nir_intrinsic_has_align_offset(intr)) {
+            unsigned align_offset = (nir_intrinsic_align_offset(intr) + offset) %
+                                    nir_intrinsic_align_mul(intr);
+            nir_intrinsic_set_align_offset(intr, align_offset);
+         }
+
+         nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
+         nir_src_rewrite(offset_src, nir_iadd_imm(&b, offset_src->ssa, offset));
+      }

       /* Reswizzle sources, which must be ALU since they have swizzle */
       assert(first_bit + comps <= NIR_MAX_VEC_COMPONENTS);
       uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
       for (unsigned i = 0; i < comps; ++i) {
          swizzle[first_bit + i] = i;
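To make the arithmetic in the new else branch concrete, here is a small standalone example with hypothetical values: a 32-bit vec4 load where only .z and .w are read, so first_bit is 2 and the alignment numbers are assumed.

#include <stdio.h>

int main(void)
{
   unsigned bit_size     = 32; /* component size in bits */
   unsigned first_bit    = 2;  /* first component actually read (.z) */
   unsigned align_mul    = 16; /* assumed alignment of the original load */
   unsigned align_offset = 0;

   /* Byte distance of the dropped components, folded into the load's
    * offset source exactly as the diff does with nir_iadd_imm(). */
   unsigned offset = (bit_size / 8) * first_bit;       /* 8 bytes */

   /* The constant alignment offset wraps modulo align_mul so it still
    * describes the shifted start address. */
   align_offset = (align_offset + offset) % align_mul; /* 8 */

   printf("offset += %u bytes, align_offset = %u\n", offset, align_offset);
   return 0;
}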