ac/nir: vectorize streamout stores for legacy pipeline optimally

Since the XFB info is sorted by offset, walk the whole vertex stride,
gathering individual components from the same or different outputs, and
once 4 consecutive components have been gathered, store them as a vec4.
A partial vector is flushed early when it hits a hole in the stride or
a buffer change.

This also removes the COHERENT flag from the VMEM stores: NGG streamout
doesn't use it either, and it doesn't appear to be needed.
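
To illustrate the gathering strategy, here is a minimal standalone C
sketch (hypothetical names and a simplified component list, not the
actual pass):

#include <stdbool.h>
#include <stdio.h>

/* One streamout component: which XFB buffer it goes to and its byte
 * offset within the vertex stride. The real pass derives these from
 * nir_xfb_output_info, which is sorted by offset. */
struct comp {
   unsigned buffer;
   unsigned offset;
};

/* Stand-in for nir_store_buffer_amd: emit one store of "count" dwords. */
static void flush(unsigned buffer, unsigned base, unsigned count)
{
   printf("store vec%u -> buffer %u at byte offset %u\n", count, buffer, base);
}

int main(void)
{
   /* Components sorted by offset, as the XFB info guarantees. */
   const struct comp comps[] = {
      {0, 0}, {0, 4}, {0, 8}, {0, 12}, /* forms one full vec4 */
      {0, 16}, {0, 24},                /* hole at offset 20 splits the store */
      {1, 0},                          /* buffer change splits it too */
   };
   unsigned num = 0, base = 0, buf = 0;

   for (unsigned i = 0; i < sizeof(comps) / sizeof(comps[0]); i++) {
      /* Flush when the vector is full, the buffer changes, or a hole appears. */
      const bool has_hole = base + num * 4 != comps[i].offset;

      if (num && (num == 4 || buf != comps[i].buffer || has_hole)) {
         flush(buf, base, num);
         num = 0;
      }
      if (num == 0) { /* begin a new vector */
         buf = comps[i].buffer;
         base = comps[i].offset;
      }
      num++;
   }
   if (num)
      flush(buf, base, num); /* flush the trailing partial vector */
   return 0;
}

This prints one vec4 store followed by three single-component stores,
matching how the pass splits vectors at holes and buffer changes.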

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32686>
Author: Marek Olšák
Date: 2024-12-17 04:02:00 -05:00
Committed by: Marge Bot
Parent: e399f3bed9
Commit: 4f2e2e10bc


@@ -1233,7 +1233,10 @@ emit_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prera
       so_write_offset[i] = offset;
    }
 
    nir_def *undef = nir_undef(b, 1, 32);
+   nir_def *zero = nir_imm_int(b, 0);
+   unsigned num_values = 0, store_offset = 0, store_buffer_index = 0;
+   nir_def *values[4];
 
    for (unsigned i = 0; i < info->output_count; i++) {
       const nir_xfb_output_info *output = info->outputs + i;
       if (stream != info->buffer_to_stream[output->buffer])
@@ -1243,35 +1246,50 @@ emit_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prera
       nir_def **output_data =
          get_output_and_type(out, output->location, output->high_16bits, &output_type);
 
-      nir_def *vec[4] = {undef, undef, undef, undef};
-      uint8_t mask = 0;
-      u_foreach_bit(j, output->component_mask) {
-         nir_def *data = output_data[j];
+      u_foreach_bit(out_comp, output->component_mask) {
+         if (!output_data[out_comp])
+            continue;
 
-         if (data) {
-            if (data->bit_size < 32) {
-               /* we need output type to convert non-32bit output to 32bit */
-               assert(output_type);
+         nir_def *data = output_data[out_comp];
 
-               nir_alu_type base_type = nir_alu_type_get_base_type(output_type[j]);
-               data = nir_convert_to_bit_size(b, data, base_type, 32);
-            }
+         if (data->bit_size < 32) {
+            /* Convert the 16-bit output to 32 bits. */
+            assert(output_type);
 
-            unsigned comp = j - output->component_offset;
-            vec[comp] = data;
-            mask |= 1 << comp;
-         }
-      }
+            nir_alu_type base_type = nir_alu_type_get_base_type(output_type[out_comp]);
+            data = nir_convert_to_bit_size(b, data, base_type, 32);
+         }
 
-      if (!mask)
-         continue;
+         assert(out_comp >= output->component_offset);
+         const unsigned store_comp = out_comp - output->component_offset;
+         const unsigned store_comp_offset = output->offset + store_comp * 4;
+         const bool has_hole = store_offset + num_values * 4 != store_comp_offset;
 
-      unsigned buffer = output->buffer;
-      nir_def *data = nir_vec(b, vec, util_last_bit(mask));
-      nir_def *zero = nir_imm_int(b, 0);
-      nir_store_buffer_amd(b, data, so_buffers[buffer], so_write_offset[buffer], zero, zero,
-                           .base = output->offset, .write_mask = mask,
-                           .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL);
+         /* Flush the gathered components to memory as a vec4 store or less if there is a hole. */
+         if (num_values && (num_values == 4 || store_buffer_index != output->buffer || has_hole)) {
+            nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
+                                 so_write_offset[store_buffer_index], zero, zero,
+                                 .base = store_offset,
+                                 .access = ACCESS_NON_TEMPORAL);
+            num_values = 0;
+         }
+
+         /* Initialize the buffer index and offset if we are beginning a new vec4 store. */
+         if (num_values == 0) {
+            store_buffer_index = output->buffer;
+            store_offset = store_comp_offset;
+         }
+
+         values[num_values++] = data;
+      }
    }
 
+   if (num_values) {
+      /* Flush the remaining components to memory (as an up to vec4 store) */
+      nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
+                           so_write_offset[store_buffer_index], zero, zero,
+                           .base = store_offset,
+                           .access = ACCESS_NON_TEMPORAL);
+   }
+
    nir_pop_if(b, NULL);