From 9e42553ca8d30a2a2cb6781774631c45285d77dd Mon Sep 17 00:00:00 2001
From: Mike Blumenkrantz
Date: Thu, 27 Jul 2023 13:47:13 -0400
Subject: [PATCH] zink: use lowered io (kinda) for i/o vars

this runs io lowering on shader create, which is a huge pita and a waste of
time since it then requires re-creating all the deref io later, but it also
makes the variables simpler by eliminating struct awfulness (which was
already eliminated by the split_blocks pass, but who's keeping track) and
will enable future use of some bizarro inter-stage linker thing

Reviewed-by: Dave Airlie
Part-of:
---
 src/gallium/drivers/zink/zink_compiler.c | 781 ++++++++++++++++++++---
 1 file changed, 687 insertions(+), 94 deletions(-)

diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index 86460b41d7e..3a3b81b8a2a 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -1366,6 +1366,7 @@ zink_screen_init_compiler(struct zink_screen *screen) .lower_mul_2x32_64 = true, .support_16bit_alu = true, /* not quite what it sounds like */ .max_unroll_iterations = 0, + .use_interpolated_input_intrinsics = true, }; screen->nir_options = default_options; @@ -1771,36 +1772,16 @@ find_var_with_location_frac(nir_shader *nir, unsigned location, unsigned locatio { assert((int)location >= 0); - unsigned found = 0; - if (!location_frac && location != VARYING_SLOT_PSIZ) { - nir_foreach_variable_with_modes(var, nir, mode) { - if (var->data.location == location) - found++; - } - } - if (found) { - /* multiple variables found for this location: find the biggest one */ - nir_variable *out = NULL; - unsigned slots = 0; - nir_foreach_variable_with_modes(var, nir, mode) { - if (var->data.location == location) { - unsigned count_slots = glsl_count_vec4_slots(var->type, false, false); - if (count_slots > slots) { - slots = count_slots; - out = var; - } - } - } - return out; - } else { - /* only one variable found or this is location_frac */ - nir_foreach_variable_with_modes(var, nir, mode) { - if (var->data.location == location && - (var->data.location_frac == location_frac || - (glsl_type_is_array(var->type) ?
glsl_array_size(var->type) : glsl_get_vector_elements(var->type)) >= location_frac + 1)) { - if (location != VARYING_SLOT_PSIZ || !have_psiz || var->data.explicit_location) - return var; - } + nir_foreach_variable_with_modes(var, nir, mode) { + if (var->data.location == location && (location != VARYING_SLOT_PSIZ || !have_psiz || var->data.explicit_location)) { + unsigned num_components = glsl_get_vector_elements(var->type); + if (glsl_type_is_64bit(glsl_without_array(var->type))) + num_components *= 2; + if (var->data.location == VARYING_SLOT_CLIP_DIST0 || var->data.location == VARYING_SLOT_CULL_DIST0) + num_components = glsl_get_aoa_size(var->type); + if (var->data.location_frac <= location_frac && + var->data.location_frac + num_components > location_frac) + return var; } } return NULL; @@ -1898,6 +1879,38 @@ get_slot_components(nir_variable *var, unsigned slot, unsigned so_slot) return num_components; } +static bool +is_var_type_bindless(nir_variable *var) +{ + switch (glsl_get_base_type(glsl_without_array(var->type))) { + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: + case GLSL_TYPE_IMAGE: + return true; + default: + break; + } + return false; +} + +static unsigned +get_var_slot_count(nir_shader *nir, nir_variable *var) +{ + assert(var->data.mode == nir_var_shader_in || var->data.mode == nir_var_shader_out); + const struct glsl_type *type = var->type; + if (nir_is_arrayed_io(var, nir->info.stage)) + type = glsl_get_array_element(type); + unsigned slot_count = 0; + if (var->data.location >= VARYING_SLOT_VAR0) + slot_count = glsl_count_vec4_slots(type, false, is_var_type_bindless(var)); + else if (glsl_type_is_array(type)) + slot_count = DIV_ROUND_UP(glsl_get_aoa_size(type), 4); + else + slot_count = 1; + return slot_count; +} + + static const struct pipe_stream_output * find_packed_output(const struct pipe_stream_output_info *so_info, uint8_t *reverse_map, unsigned slot) { @@ -1939,46 +1952,59 @@ update_so_info(struct zink_shader *zs, nir_shader *nir, const struct pipe_stream nir_variable *psiz = NULL; for (unsigned i = 0; i < so_info->num_outputs; i++) { const struct pipe_stream_output *output = &so_info->output[i]; - unsigned slot = reverse_map[output->register_index]; /* always set stride to be used during draw */ zs->sinfo.so_info.stride[output->output_buffer] = so_info->stride[output->output_buffer]; if (zs->info.stage != MESA_SHADER_GEOMETRY || util_bitcount(zs->info.gs.active_stream_mask) == 1) { - nir_variable *var = NULL; - unsigned so_slot; - while (!var) - var = find_var_with_location_frac(nir, slot--, output->start_component, have_psiz, nir_var_shader_out); - if (var->data.location == VARYING_SLOT_PSIZ) - psiz = var; - so_slot = slot + 1; - slot = reverse_map[output->register_index]; - if (var->data.explicit_xfb_buffer) { - /* handle dvec3 where gallium splits streamout over 2 registers */ - for (unsigned j = 0; j < output->num_components; j++) - inlined[slot][output->start_component + j] = true; - } - if (is_inlined(inlined[slot], output)) - continue; - bool is_struct = glsl_type_is_struct_or_ifc(glsl_without_array(var->type)); - unsigned num_components = get_slot_components(var, slot, so_slot); - /* if this is the entire variable, try to blast it out during the initial declaration - * structs must be handled later to ensure accurate analysis - */ - if (!is_struct && (num_components == output->num_components || (num_components > output->num_components && output->num_components == 4))) { - var->data.explicit_xfb_buffer = 1; - var->data.xfb.buffer = 
output->output_buffer; - var->data.xfb.stride = so_info->stride[output->output_buffer] * 4; - var->data.offset = output->dst_offset * 4; - var->data.stream = output->stream; - for (unsigned j = 0; j < output->num_components; j++) - inlined[slot][output->start_component + j] = true; - } else { - /* otherwise store some metadata for later */ - packed |= BITFIELD64_BIT(slot); - packed_components[slot] += output->num_components; - packed_streams[slot] |= BITFIELD_BIT(output->stream); - packed_buffers[slot] |= BITFIELD_BIT(output->output_buffer); - for (unsigned j = 0; j < output->num_components; j++) - packed_offsets[output->register_index][j + output->start_component] = output->dst_offset + j; + for (unsigned c = 0; !is_inlined(inlined[reverse_map[output->register_index]], output) && c < output->num_components; c++) { + unsigned slot = reverse_map[output->register_index]; + if (inlined[slot][output->start_component + c]) + continue; + nir_variable *var = NULL; + while (!var && slot < VARYING_SLOT_TESS_MAX) + var = find_var_with_location_frac(nir, slot--, output->start_component + c, have_psiz, nir_var_shader_out); + slot = reverse_map[output->register_index]; + unsigned slot_count = var ? get_var_slot_count(nir, var) : 0; + if (!var || var->data.location > slot || var->data.location + slot_count <= slot) { + /* if no variable is found for the xfb output, no output exists */ + inlined[slot][c + output->start_component] = true; + continue; + } + if (var->data.location == VARYING_SLOT_PSIZ) + psiz = var; + if (var->data.explicit_xfb_buffer) { + /* handle dvec3 where gallium splits streamout over 2 registers */ + for (unsigned j = 0; j < output->num_components; j++) + inlined[slot][c + output->start_component + j] = true; + } + if (is_inlined(inlined[slot], output)) + continue; + assert(!glsl_type_is_array(var->type) || var->data.location == VARYING_SLOT_CLIP_DIST0 || var->data.location == VARYING_SLOT_CULL_DIST0); + assert(!glsl_type_is_struct_or_ifc(var->type)); + unsigned num_components = glsl_type_is_array(var->type) ? 
glsl_get_aoa_size(var->type) : glsl_get_vector_elements(var->type); + if (glsl_type_is_64bit(glsl_without_array(var->type))) + num_components *= 2; + /* if this is the entire variable, try to blast it out during the initial declaration + * structs must be handled later to ensure accurate analysis + */ + if ((num_components == output->num_components || + num_components < output->num_components || + (num_components > output->num_components && output->num_components == 4))) { + var->data.explicit_xfb_buffer = 1; + var->data.xfb.buffer = output->output_buffer; + var->data.xfb.stride = so_info->stride[output->output_buffer] * 4; + var->data.offset = (c + output->dst_offset) * 4; + var->data.stream = output->stream; + for (unsigned j = 0; j < MIN2(num_components, output->num_components); j++) + inlined[slot][c + output->start_component + j] = true; + } else { + /* otherwise store some metadata for later */ + packed |= BITFIELD64_BIT(slot); + packed_components[slot] += output->num_components; + packed_streams[slot] |= BITFIELD_BIT(output->stream); + packed_buffers[slot] |= BITFIELD_BIT(output->output_buffer); + for (unsigned j = 0; j < output->num_components; j++) + packed_offsets[output->register_index][j + output->start_component + c] = output->dst_offset + j; + } } } } @@ -1996,6 +2022,10 @@ update_so_info(struct zink_shader *zs, nir_shader *nir, const struct pipe_stream nir_variable *var = NULL; while (!var) var = find_var_with_location_frac(nir, slot--, output->start_component, have_psiz, nir_var_shader_out); + slot = reverse_map[output->register_index]; + unsigned slot_count = var ? get_var_slot_count(nir, var) : 0; + if (!var || var->data.location > slot || var->data.location + slot_count <= slot) + continue; /* this is a lowered 64bit variable that can't be exported due to packing */ if (var->data.is_xfb) goto out; @@ -2422,6 +2452,32 @@ remove_bo_access(nir_shader *shader, struct zink_shader *zs) return nir_shader_instructions_pass(shader, remove_bo_access_instr, nir_metadata_dominance, &bo); } +static bool +filter_io_instr(nir_intrinsic_instr *intr, bool *is_load, bool *is_input, bool *is_interp) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_interpolated_input: + *is_interp = true; + FALLTHROUGH; + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: + *is_input = true; + FALLTHROUGH; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: + *is_load = true; + FALLTHROUGH; + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + break; + default: + return false; + } + return true; +} + static bool find_var_deref(nir_shader *nir, nir_variable *var) { @@ -2439,6 +2495,47 @@ find_var_deref(nir_shader *nir, nir_variable *var) return false; } +static bool +find_var_io(nir_shader *nir, nir_variable *var) +{ + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = false; + bool is_input = false; + bool is_interp = false; + if (!filter_io_instr(intr, &is_load, &is_input, &is_interp)) + continue; + if (var->data.mode == nir_var_shader_in && !is_input) + continue; + if (var->data.mode == nir_var_shader_out && is_input) + continue; + unsigned slot_offset = 0; + if 
(var->data.fb_fetch_output && !is_load) + continue; + if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_load && !is_input && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index) + continue; + nir_src *src_offset = nir_get_io_offset_src(intr); + if (src_offset && nir_src_is_const(*src_offset)) + slot_offset = nir_src_as_uint(*src_offset); + unsigned slot_count = get_var_slot_count(nir, var); + if (var->data.mode & (nir_var_shader_out | nir_var_shader_in) && + var->data.fb_fetch_output == nir_intrinsic_io_semantics(intr).fb_fetch_output && + var->data.location <= nir_intrinsic_io_semantics(intr).location + slot_offset && + var->data.location + slot_count > nir_intrinsic_io_semantics(intr).location + slot_offset) + return true; + } + } + } + return false; +} + struct clamp_layer_output_state { nir_variable *original; nir_variable *clamped; @@ -2488,7 +2585,7 @@ clamp_layer_output(nir_shader *vs, nir_shader *fs, unsigned *next_location) } struct clamp_layer_output_state state = {0}; state.original = nir_find_variable_with_location(vs, nir_var_shader_out, VARYING_SLOT_LAYER); - if (!state.original || !find_var_deref(vs, state.original)) + if (!state.original || (!find_var_deref(vs, state.original) && !find_var_io(vs, state.original))) return false; state.clamped = nir_variable_create(vs, nir_var_shader_out, glsl_int_type(), "layer_clamped"); state.clamped->data.location = VARYING_SLOT_LAYER; @@ -2636,16 +2733,16 @@ rewrite_read_as_0(nir_builder *b, nir_instr *instr, void *data) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_deref) + if (intr->intrinsic != nir_intrinsic_load_input) return false; - nir_variable *deref_var = nir_intrinsic_get_var(intr, 0); - if (deref_var != var) + unsigned location = nir_intrinsic_io_semantics(intr).location; + if (location != var->data.location) return false; b->cursor = nir_before_instr(instr); nir_def *zero = nir_imm_zero(b, intr->def.num_components, intr->def.bit_size); if (b->shader->info.stage == MESA_SHADER_FRAGMENT) { - switch (var->data.location) { + switch (location) { case VARYING_SLOT_COL0: case VARYING_SLOT_COL1: case VARYING_SLOT_BFC0: @@ -2663,6 +2760,34 @@ rewrite_read_as_0(nir_builder *b, nir_instr *instr, void *data) return true; } + + +static bool +delete_psiz_store_instr(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + break; + default: + return false; + } + if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PSIZ) + return false; + nir_instr_remove(instr); + return true; +} + +static bool +delete_psiz_store(nir_shader *nir) +{ + return nir_shader_instructions_pass(nir, delete_psiz_store_instr, nir_metadata_dominance, NULL); +} + void zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer) { @@ -2677,6 +2802,7 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh if (var && !var->data.explicit_location && !nir_find_variable_with_location(consumer, nir_var_shader_in, VARYING_SLOT_PSIZ)) { var->data.mode = nir_var_shader_temp; nir_fixup_deref_modes(producer); + delete_psiz_store(producer); NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL); optimize_nir(producer, NULL); 
} @@ -3032,8 +3158,6 @@ lower_64bit_vars(nir_shader *shader, bool doubles_only) bool progress = false; struct hash_table *derefs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); struct set *deletes = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - nir_foreach_variable_with_modes(var, shader, nir_var_shader_in | nir_var_shader_out) - progress |= lower_64bit_vars_loop(shader, var, derefs, deletes, doubles_only); nir_foreach_function_impl(impl, shader) { nir_foreach_function_temp_variable(var, impl) { if (!glsl_type_contains_64bit(var->type) || (doubles_only && !glsl_contains_double(var->type))) @@ -3285,11 +3409,11 @@ static void prune_io(nir_shader *nir) { nir_foreach_shader_in_variable_safe(var, nir) { - if (!find_var_deref(nir, var)) + if (!find_var_deref(nir, var) && !find_var_io(nir, var)) var->data.mode = nir_var_shader_temp; } nir_foreach_shader_out_variable_safe(var, nir) { - if (!find_var_deref(nir, var)) + if (!find_var_deref(nir, var) && !find_var_io(nir, var)) var->data.mode = nir_var_shader_temp; } NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); @@ -3501,6 +3625,172 @@ invert_point_coord(nir_shader *nir) return nir_shader_instructions_pass(nir, invert_point_coord_instr, nir_metadata_dominance, NULL); } +static bool +add_derefs_instr(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = false; + bool is_input = false; + bool is_interp = false; + if (!filter_io_instr(intr, &is_load, &is_input, &is_interp)) + return false; + unsigned loc = nir_intrinsic_io_semantics(intr).location; + /* loop over all the variables and rewrite corresponding access */ + nir_foreach_variable_with_modes(var, b->shader, is_input ? nir_var_shader_in : nir_var_shader_out) { + nir_src *src_offset = nir_get_io_offset_src(intr); + const unsigned slot_offset = src_offset && nir_src_is_const(*src_offset) ? nir_src_as_uint(*src_offset) : 0; + const struct glsl_type *type = var->type; + if (nir_is_arrayed_io(var, b->shader->info.stage)) + type = glsl_get_array_element(type); + unsigned slot_count = get_var_slot_count(b->shader, var); + unsigned location = loc + slot_offset; + /* filter access that isn't specific to this variable */ + if (var->data.location > location || var->data.location + slot_count <= location) + continue; + if (var->data.fb_fetch_output != nir_intrinsic_io_semantics(intr).fb_fetch_output) + continue; + if (b->shader->info.stage == MESA_SHADER_FRAGMENT && !is_load && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index) + continue; + unsigned frac = nir_intrinsic_component(intr); + unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]); + /* set c aligned/rounded down to dword */ + unsigned c = frac; + if (frac && bit_size < 32) + c = frac * bit_size / 32; + + unsigned size = 0; + bool is_struct = glsl_type_is_struct(glsl_without_array(type)); + if (is_struct) + size = get_slot_components(var, var->data.location + slot_offset, var->data.location); + else if ((var->data.mode == nir_var_shader_out && var->data.location < VARYING_SLOT_VAR0) || + (var->data.mode == nir_var_shader_in && var->data.location < (b->shader->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0))) + size = glsl_type_is_array(type) ? 
glsl_get_aoa_size(type) : glsl_get_vector_elements(type); + else + size = glsl_get_vector_elements(glsl_without_array(type)); + assert(size); + if (glsl_type_is_64bit(glsl_without_array(var->type))) + size *= 2; + if (var->data.location != location && size > 4 && size % 4 && !is_struct) { + /* adjust for dvec3-type slot overflow */ + assert(location > var->data.location); + size -= (location - var->data.location) * 4; + } + assert(size); + if (var->data.location_frac + size <= c || var->data.location_frac > c) + continue; + + b->cursor = nir_before_instr(instr); + nir_deref_instr *deref = nir_build_deref_var(b, var); + if (nir_is_arrayed_io(var, b->shader->info.stage)) { + assert(intr->intrinsic != nir_intrinsic_store_output); + deref = nir_build_deref_array(b, deref, intr->src[!is_load].ssa); + } + if (glsl_type_is_array(type)) { + /* unroll array derefs */ + unsigned idx = frac - var->data.location_frac; + assert(src_offset); + if (var->data.location < VARYING_SLOT_VAR0) { + if (src_offset) { + /* clip/cull dist use different array offset semantics */ + bool is_clipdist = (b->shader->info.stage != MESA_SHADER_VERTEX || var->data.mode == nir_var_shader_out) && + var->data.location >= VARYING_SLOT_CLIP_DIST0 && var->data.location <= VARYING_SLOT_CULL_DIST1; + /* this is explicit for ease of debugging but could be collapsed at some point in the future */ + if (nir_src_is_const(*src_offset)) { + unsigned offset = slot_offset; + if (is_clipdist) + offset *= 4; + deref = nir_build_deref_array_imm(b, deref, offset + idx); + } else { + nir_def *offset = src_offset->ssa; + if (is_clipdist) + offset = nir_imul_imm(b, offset, 4); + deref = nir_build_deref_array(b, deref, idx ? nir_iadd_imm(b, offset, idx) : offset); + } + } else { + deref = nir_build_deref_array_imm(b, deref, idx); + } + type = glsl_get_array_element(type); + } else { + /* need to convert possible N*M to [N][M] */ + nir_def *nm = idx ? nir_iadd_imm(b, src_offset->ssa, idx) : src_offset->ssa; + while (glsl_type_is_array(type)) { + const struct glsl_type *elem = glsl_get_array_element(type); + unsigned type_size = glsl_count_vec4_slots(elem, false, false); + nir_def *n = glsl_type_is_array(elem) ?
nir_udiv_imm(b, nm, type_size) : nm; + if (glsl_type_is_vector_or_scalar(elem) && glsl_type_is_64bit(elem) && glsl_get_vector_elements(elem) > 2) + n = nir_udiv_imm(b, n, 2); + deref = nir_build_deref_array(b, deref, n); + nm = nir_umod_imm(b, nm, type_size); + type = glsl_get_array_element(type); + } + } + } else if (glsl_type_is_struct(type)) { + deref = nir_build_deref_struct(b, deref, slot_offset); + } + if (is_load) { + nir_def *load; + if (is_interp) { + nir_def *interp = intr->src[0].ssa; + nir_intrinsic_instr *interp_intr = nir_instr_as_intrinsic(interp->parent_instr); + assert(interp_intr); + switch (interp_intr->intrinsic) { + case nir_intrinsic_load_barycentric_centroid: + load = nir_interp_deref_at_centroid(b, intr->num_components, bit_size, &deref->def); + break; + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_sample: + load = nir_load_deref(b, deref); + break; + case nir_intrinsic_load_barycentric_at_sample: + load = nir_interp_deref_at_sample(b, intr->num_components, bit_size, &deref->def, interp_intr->src[0].ssa); + break; + case nir_intrinsic_load_barycentric_at_offset: + load = nir_interp_deref_at_offset(b, intr->num_components, bit_size, &deref->def, interp_intr->src[0].ssa); + break; + default: + unreachable("unhandled interp!"); + } + } else { + load = nir_load_deref(b, deref); + } + nir_def_rewrite_uses(&intr->def, load); + } else { + nir_def *store = intr->src[0].ssa; + assert(!glsl_type_is_array(type)); + unsigned num_components = glsl_get_vector_elements(type); + /* pad/filter components to match deref type */ + if (intr->num_components < num_components) { + nir_def *zero = nir_imm_zero(b, 1, bit_size); + nir_def *vec[4] = {zero, zero, zero, zero}; + u_foreach_bit(i, nir_intrinsic_write_mask(intr)) + vec[c - var->data.location_frac + i] = nir_channel(b, store, i); + store = nir_vec(b, vec, num_components); + } else if (store->num_components > num_components) { + store = nir_channels(b, store, nir_intrinsic_write_mask(intr)); + } + if (store->bit_size != glsl_get_bit_size(type)) { + /* this should be some weird bindless io conversion */ + assert(store->bit_size == 64 && glsl_get_bit_size(type) == 32); + assert(num_components != store->num_components); + store = nir_unpack_64_2x32(b, store); + } + nir_store_deref(b, deref, store, BITFIELD_RANGE(c - var->data.location_frac, intr->num_components)); + } + nir_instr_remove(instr); + return true; + } + unreachable("failed to find variable for explicit io!"); + return true; +} + +static bool +add_derefs(nir_shader *nir) +{ + return nir_shader_instructions_pass(nir, add_derefs_instr, nir_metadata_dominance, NULL); +} + static struct zink_shader_object compile_module(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, bool can_shobj, struct zink_program *pg) { @@ -3535,9 +3825,10 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg) { struct zink_shader_info *sinfo = &zs->sinfo; - bool need_optimize = false; + bool need_optimize = true; bool inlined_uniforms = false; + NIR_PASS_V(nir, add_derefs); NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8); if (key) { if (key->inline_uniforms) { @@ -3758,6 +4049,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) default: break; } } + NIR_PASS_V(nir, add_derefs); NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ?
1 : 8); if (screen->driconf.inline_uniforms) { NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_global | nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared, NULL, NULL); @@ -4125,12 +4417,13 @@ lower_bindless_io_instr(nir_builder *b, nir_instr *in, void *data) if (in->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in); - if (instr->intrinsic != nir_intrinsic_load_deref && - instr->intrinsic != nir_intrinsic_store_deref) + bool is_load = false; + bool is_input = false; + bool is_interp = false; + if (!filter_io_instr(instr, &is_load, &is_input, &is_interp)) return false; - nir_deref_instr *src_deref = nir_src_as_deref(instr->src[0]); - nir_variable *var = nir_deref_instr_get_variable(src_deref); + nir_variable *var = find_var_with_location_frac(b->shader, nir_intrinsic_io_semantics(instr).location, nir_intrinsic_component(instr), false, is_input ? nir_var_shader_in : nir_var_shader_out); if (var->data.bindless) return false; if (var->data.mode != nir_var_shader_in && var->data.mode != nir_var_shader_out) @@ -4140,17 +4433,7 @@ lower_bindless_io_instr(nir_builder *b, nir_instr *in, void *data) var->type = glsl_int64_t_type(); var->data.bindless = 1; - b->cursor = nir_before_instr(in); - nir_deref_instr *deref = nir_build_deref_var(b, var); - if (instr->intrinsic == nir_intrinsic_load_deref) { - nir_def *def = nir_load_deref(b, deref); - nir_instr_rewrite_src_ssa(in, &instr->src[0], def); - nir_def_rewrite_uses(&instr->def, def); - } else { - nir_store_deref(b, deref, instr->src[1].ssa, nir_intrinsic_write_mask(instr)); - } - nir_instr_remove(in); - nir_instr_remove(&src_deref->instr); + nir_intrinsic_set_dest_type(instr, nir_type_int64); return true; } @@ -4823,6 +5106,292 @@ zink_flat_flags(struct nir_shader *shader) return flat_flags; } +static void +store_location_var(nir_variable *vars[VARYING_SLOT_TESS_MAX][4], nir_variable *var, nir_shader *nir) +{ + unsigned slot_count; + const struct glsl_type *type; + bool is_bindless = is_var_type_bindless(var); + if (nir_is_arrayed_io(var, nir->info.stage)) { + type = glsl_get_array_element(var->type); + slot_count = glsl_count_vec4_slots(type, false, is_bindless); + } else { + type = glsl_without_array(var->type); + slot_count = glsl_count_vec4_slots(var->type, false, is_bindless); + } + unsigned num_components = glsl_get_vector_elements(glsl_without_array(type)); + if (glsl_type_is_64bit(glsl_without_array(var->type))) + num_components *= 2; + if (!num_components) + num_components = 4; //this is a struct + for (unsigned i = 0; i < slot_count; i++) { + for (unsigned j = 0; j < MIN2(num_components, 4); j++) { + /* allow partial overlap */ + if (!vars[var->data.location + i][var->data.location_frac + j]) + vars[var->data.location + i][var->data.location_frac + j] = var; + } + if (num_components > 4) + num_components -= 4; + } +} + +static void +rework_io_vars(nir_shader *nir, nir_variable_mode mode) +{ + assert(mode == nir_var_shader_out || mode == nir_var_shader_in); + assert(util_bitcount(mode) == 1); + nir_variable *old_vars[VARYING_SLOT_TESS_MAX][4] = {{NULL}}; + nir_variable *vars[VARYING_SLOT_TESS_MAX][4] = {{NULL}}; + bool found = false; + /* store old vars */ + nir_foreach_variable_with_modes_safe(var, nir, mode) { + if ((mode == nir_var_shader_out && var->data.location < VARYING_SLOT_VAR0) || + (mode == nir_var_shader_in && var->data.location < (nir->info.stage == MESA_SHADER_VERTEX ? 
VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0))) + continue; + /* account for vertex attr aliasing and bindless io */ + if (nir->info.stage != MESA_SHADER_VERTEX || mode == nir_var_shader_out || + (mode == nir_var_shader_in && nir->info.stage == MESA_SHADER_VERTEX && !old_vars[var->data.location][var->data.location_frac]) || + is_var_type_bindless(var)) + store_location_var(old_vars, var, nir); + /* skip interpolated inputs and bindless io */ + if (is_var_type_bindless(var) || + (mode == nir_var_shader_out && nir->info.stage == MESA_SHADER_FRAGMENT)) { + store_location_var(vars, var, nir); + } else { + var->data.mode = nir_var_shader_temp; + found = true; + } + } + if (!found) + return; + nir_fixup_deref_modes(nir); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); + /* scan for vars using indirect array access */ + BITSET_DECLARE(indirect_access, 128); + BITSET_ZERO(indirect_access); + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = false; + bool is_input = false; + bool is_interp = false; + if (!filter_io_instr(intr, &is_load, &is_input, &is_interp)) + continue; + nir_src *src_offset = nir_get_io_offset_src(intr); + if (!is_input && !src_offset) + continue; + if (mode == nir_var_shader_in && !is_input) + continue; + if (mode == nir_var_shader_out && is_input) + continue; + nir_io_semantics s = nir_intrinsic_io_semantics(intr); + if ((mode == nir_var_shader_out && s.location < VARYING_SLOT_VAR0) || + (mode == nir_var_shader_in && s.location < (nir->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0))) + continue; + if (!nir_src_is_const(*src_offset)) + BITSET_SET(indirect_access, s.location); + } + } + } + /* loop and create vars */ + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = false; + bool is_input = false; + bool is_interp = false; + if (!filter_io_instr(intr, &is_load, &is_input, &is_interp)) + continue; + if (mode == nir_var_shader_in && !is_input) + continue; + if (mode == nir_var_shader_out && is_input) + continue; + nir_io_semantics s = nir_intrinsic_io_semantics(intr); + if ((mode == nir_var_shader_out && s.location < VARYING_SLOT_VAR0) || + (mode == nir_var_shader_in && s.location < (nir->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0))) + continue; + unsigned slot_offset = 0; + bool is_indirect = BITSET_TEST(indirect_access, s.location); + nir_src *src_offset = nir_get_io_offset_src(intr); + if (src_offset && !is_indirect) { + assert(nir_src_is_const(*src_offset)); + slot_offset = nir_src_as_uint(*src_offset); + } + unsigned location = s.location + slot_offset; + unsigned frac = nir_intrinsic_component(intr); + unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]); + /* set c aligned/rounded down to dword */ + unsigned c = frac; + if (frac && bit_size < 32) + c = frac * bit_size / 32; + nir_alu_type type = is_load ? 
nir_intrinsic_dest_type(intr) : nir_intrinsic_src_type(intr); + nir_variable *old_var = old_vars[location][c]; + assert(old_var); + if (is_var_type_bindless(old_var)) + continue; + /* ensure dword is filled with like-sized components */ + unsigned max_components = intr->num_components; + if (bit_size == 16) + max_components = align(max_components, 2); + else if (bit_size == 8) + max_components = align(max_components, 4); + if (c + (bit_size == 64 ? max_components * 2 : max_components) > 4) + c = 0; + const struct glsl_type *vec_type = glsl_vector_type(nir_get_glsl_base_type_for_nir_type(type), max_components); + /* reset the mode for nir_is_arrayed_io to work */ + nir_variable_mode oldmode = old_var->data.mode; + old_var->data.mode = mode; + bool is_arrayed = nir_is_arrayed_io(old_var, nir->info.stage); + old_var->data.mode = oldmode; + if (is_indirect) { + /* indirect array access requires the full array in a single variable */ + unsigned slot_count = 0; + if (is_arrayed) + slot_count = glsl_count_vec4_slots(glsl_get_array_element(old_var->type), false, false); + else + slot_count = glsl_count_vec4_slots(old_var->type, false, false); + if (bit_size == 64 && slot_count > 1) + slot_count /= 2; + if (slot_count > 1) + vec_type = glsl_array_type(vec_type, slot_count, glsl_get_explicit_stride(vec_type)); + } + if (is_arrayed) + vec_type = glsl_array_type(vec_type, glsl_array_size(old_var->type), glsl_get_explicit_stride(old_var->type)); + if (vars[location][c]) { + if (glsl_get_vector_elements(glsl_without_array(vars[location][c]->type)) < glsl_get_vector_elements(glsl_without_array(vec_type))) { + /* enlarge existing vars if necessary */ + vars[location][c]->type = vec_type; + store_location_var(vars, vars[location][c], nir); + } + continue; + } + + assert(!vars[location][c] || + (nir_get_nir_type_for_glsl_base_type(glsl_get_base_type(glsl_without_array(vars[location][c]->type))) == type && + glsl_get_vector_elements(glsl_without_array(vars[location][c]->type)) >= intr->num_components)); + nir_variable *var = nir_variable_clone(old_var, nir); + var->data.mode = mode; + var->type = vec_type; + var->data.driver_location = nir_intrinsic_base(intr) + slot_offset; + var->data.location_frac = c; + var->data.location = location; + nir_shader_add_variable(nir, var); + store_location_var(vars, var, nir); + } + } + } + if (mode != nir_var_shader_out) + return; + /* scan for missing components which would break shader interfaces */ + for (unsigned i = 0; i < VARYING_SLOT_TESS_MAX; i++) { + for (unsigned j = 0; j < 4; j++) { + if (!old_vars[i][j] || vars[i][j] || glsl_type_is_struct(glsl_without_array(old_vars[i][j]->type))) + continue; + + nir_variable *copy = NULL; + nir_variable *ref = NULL; + for (unsigned k = 0; k < 4; k++) { + if (!copy) + copy = vars[i][k]; + if (!ref) + ref = old_vars[i][k]; + } + assert(copy); + /* add a 1 component variable to fill the hole */ + nir_variable *var = nir_variable_clone(copy, nir); + var->data.mode = mode; + const struct glsl_type *type = glsl_without_array_or_matrix(var->type); + if (glsl_type_is_vector_or_scalar(type)) + var->type = glsl_vector_type(glsl_get_base_type(type), 1); + else + var->type = glsl_vector_type(GLSL_TYPE_FLOAT, 1); + var->data.location_frac = j; + assert(j % 2 == 0 || !glsl_type_is_64bit(glsl_without_array(var->type))); + nir_shader_add_variable(nir, var); + store_location_var(vars, var, nir); + /* write zero so it doesn't get pruned */ + nir_builder b = nir_builder_at(nir_after_block(nir_impl_last_block(nir_shader_get_entrypoint(nir)))); + 
nir_def *store = nir_imm_intN_t(&b, j == 3 ? 1 : 0, glsl_type_is_64bit(glsl_without_array(var->type)) ? 64 : 32); + if (nir_is_arrayed_io(copy, nir->info.stage)) { + var->type = glsl_array_type(var->type, glsl_array_size(ref->type), glsl_get_explicit_stride(ref->type)); + nir_deref_instr *deref = nir_build_deref_var(&b, var); + deref = nir_build_deref_array(&b, deref, nir_load_invocation_id(&b)); + nir_store_deref(&b, deref, store, 0x1); + } else { + nir_store_var(&b, var, store, 0x1); + } + } + } +} + + +static bool +eliminate_io_wrmasks_instr(const nir_instr *instr, const void *data) +{ + const nir_shader *nir = data; + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + break; + default: + return false; + } + unsigned src_components = nir_intrinsic_src_components(intr, 0); + unsigned wrmask = nir_intrinsic_write_mask(intr); + unsigned num_components = util_bitcount(wrmask); + if (num_components != src_components) + return true; + if ((nir_intrinsic_instr_src_type(intr, 0) & NIR_ALU_TYPE_SIZE_MASK) == 64) + num_components *= 2; + if (nir->xfb_info) { + nir_io_semantics s = nir_intrinsic_io_semantics(intr); + nir_src *src_offset = nir_get_io_offset_src(intr); + if (nir_src_is_const(*src_offset)) { + unsigned slot_offset = nir_src_as_uint(*src_offset); + for (unsigned i = 0; i < nir->xfb_info->output_count; i++) { + if (nir->xfb_info->outputs[i].location == s.location + slot_offset) { + unsigned xfb_components = util_bitcount(nir->xfb_info->outputs[i].component_mask); + if (xfb_components != MIN2(4, num_components)) + return true; + num_components -= xfb_components; + if (!num_components) + break; + } + } + } else { + for (unsigned i = 0; i < nir->xfb_info->output_count; i++) { + if (nir->xfb_info->outputs[i].location >= s.location && + nir->xfb_info->outputs[i].location < s.location + s.num_slots) { + unsigned xfb_components = util_bitcount(nir->xfb_info->outputs[i].component_mask); + if (xfb_components < MIN2(num_components, 4)) + return true; + num_components -= xfb_components; + if (!num_components) + break; + } + } + } + } + return false; +} + +static int +zink_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, const struct pipe_stream_output_info *so_info) @@ -4849,8 +5418,32 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, nir->info.stage == MESA_SHADER_TESS_EVAL) indirect_derefs_modes |= nir_var_shader_in | nir_var_shader_out; - NIR_PASS_V(nir, nir_lower_indirect_derefs, indirect_derefs_modes, - UINT32_MAX); + NIR_PASS_V(nir, nir_lower_indirect_derefs, indirect_derefs_modes, UINT32_MAX); + + nir_lower_io_options lower_io_flags = 0; + if (!screen->info.feats.features.shaderInt64 || !screen->info.feats.features.shaderFloat64) + lower_io_flags = nir_lower_io_lower_64bit_to_32; + else if (!screen->info.feats.features.shaderFloat64) + lower_io_flags = nir_lower_io_lower_64bit_float_to_32; + bool temp_inputs = nir->info.stage != MESA_SHADER_VERTEX && nir->info.inputs_read & BITFIELD_RANGE(VARYING_SLOT_CLIP_DIST0, 4); + bool temp_outputs = nir->info.stage != MESA_SHADER_FRAGMENT && (nir->info.outputs_read | nir->info.outputs_written) & BITFIELD_RANGE(VARYING_SLOT_CLIP_DIST0, 4); + if
(temp_inputs || temp_outputs) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), temp_outputs, temp_inputs); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + } + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_out, zink_type_size, lower_io_flags); + if (nir->info.stage == MESA_SHADER_VERTEX) + lower_io_flags |= nir_lower_io_lower_64bit_to_32; + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, zink_type_size, lower_io_flags); + optimize_nir(nir, NULL); + nir_gather_xfb_info_from_intrinsics(nir); + NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_shader_out, eliminate_io_wrmasks_instr, nir); + /* clean up io to improve direct access */ + optimize_nir(nir, NULL); + rework_io_vars(nir, nir_var_shader_in); + rework_io_vars(nir, nir_var_shader_out); if (nir->info.stage < MESA_SHADER_COMPUTE) create_gfx_pushconst(nir);
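
For anyone reading this patch cold: "lowered io" means nir_lower_io has replaced variable-deref access with location-based intrinsics that carry nir_io_semantics. Schematically (this is a paraphrase of the two IR forms, not literal nir_print output):

   before (deref io):   store_deref(deref_var(&color), value) [wrmask=xyzw]
   after  (lowered io): store_output(value, offset) [base, component,
                        src_type, io_semantics.location=VARYING_SLOT_VAR1]

That shift is why the patch cuts both ways: rework_io_vars deletes the old user-facing variables and rebuilds them from intrinsic metadata, while add_derefs reconstructs deref form for the parts of the compiler (and ntv) that still consume variables.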
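On the update_so_info hunk: "inlining" a streamout output means stamping the xfb parameters straight onto the producing variable instead of carrying them in side metadata. Reduced to just the fields involved, with made-up constants (buffer 0, dword offset 4, stream 0) and the dword-to-byte scaling that explains the *4s in the hunk:

   #include "nir.h"
   #include "pipe/p_state.h"

   /* sketch: capture one output to xfb buffer 0; nir_variable offsets and
    * strides are in bytes while gallium stream output info counts dwords */
   static void
   mark_xfb_inlined(nir_variable *var, const struct pipe_stream_output_info *so_info)
   {
      var->data.explicit_xfb_buffer = 1;
      var->data.xfb.buffer = 0;
      var->data.xfb.stride = so_info->stride[0] * 4;
      var->data.offset = 4 * 4; /* dst_offset of 4 dwords -> 16 bytes */
      var->data.stream = 0;
   }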
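filter_io_instr's fallthrough chain classifies an io intrinsic in a single switch: load_interpolated_input sets all three flags, load_input and load_per_vertex_input set is_input and is_load, the load_output variants set only is_load, the store_* cases set none, and anything else is rejected. A minimal sketch of a pass built on top of it (the counting pass itself is hypothetical; the helper is the one from this patch):

   #include "nir.h"

   static bool
   count_io_loads_instr(nir_builder *b, nir_instr *instr, void *data)
   {
      unsigned *count = data;
      if (instr->type != nir_instr_type_intrinsic)
         return false;
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      bool is_load = false, is_input = false, is_interp = false;
      if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
         return false;
      if (is_load)
         (*count)++;
      return false; /* read-only scan, no IR changes */
   }

Driven as nir_shader_instructions_pass(nir, count_io_loads_instr, nir_metadata_all, &count), the same shape as every other io walk this patch adds.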
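find_var_with_location_frac, find_var_io, and the variable loop in add_derefs_instr all reduce to the same half-open interval test: a variable occupying slot_count vec4 slots starting at data.location matches an access at the semantic location plus any constant offset iff the range covers it. Distilled into a standalone helper (hypothetical name, plain C):

   #include <stdbool.h>

   /* does a var spanning [var_loc, var_loc + slot_count) cover the slot? */
   static bool
   var_covers_slot(unsigned var_loc, unsigned slot_count,
                   unsigned sem_loc, unsigned slot_offset)
   {
      unsigned slot = sem_loc + slot_offset;
      return var_loc <= slot && var_loc + slot_count > slot;
   }

The same test then repeats componentwise against location_frac, which is why get_var_slot_count and the num_components math show up next to every one of these scans.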
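The core of add_derefs_instr, with the arrayed-io, struct, and interpolation handling stripped away, is just "rebuild a deref chain and swap the intrinsic for a deref access". A sketch of the direct load case, assuming the matching variable has already been located (which is what the pass's foreach loop does):

   #include "nir_builder.h"

   static void
   rewrite_load_input_to_deref(nir_builder *b, nir_intrinsic_instr *intr,
                               nir_variable *var)
   {
      b->cursor = nir_before_instr(&intr->instr);
      nir_deref_instr *deref = nir_build_deref_var(b, var);
      nir_def *load = nir_load_deref(b, deref);
      nir_def_rewrite_uses(&intr->def, load);
      nir_instr_remove(&intr->instr);
   }

The messy remainder in the real pass is exactly what this sketch omits: per-vertex array derefs, N*M-to-[N][M] index splitting, and the clip/cull distance quirk where constant array offsets count components rather than slots and so get scaled by 4.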
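rework_io_vars then recreates shader_in/shader_out variables purely from intrinsic metadata. Simplified to the essential fields for a direct, non-arrayed output (hypothetical helper; the real code additionally handles 8/16-bit component packing, 64-bit doubling, indirect arrays, and arrayed io):

   #include "nir_builder.h"

   static nir_variable *
   recreate_out_var(nir_shader *nir, nir_intrinsic_instr *intr)
   {
      nir_io_semantics s = nir_intrinsic_io_semantics(intr);
      nir_alu_type type = nir_intrinsic_src_type(intr);
      const struct glsl_type *vec_type =
         glsl_vector_type(nir_get_glsl_base_type_for_nir_type(type),
                          intr->num_components);
      nir_variable *var = nir_variable_create(nir, nir_var_shader_out,
                                              vec_type, "rebuilt_out");
      var->data.location = s.location;
      var->data.location_frac = nir_intrinsic_component(intr);
      var->data.driver_location = nir_intrinsic_base(intr);
      return var;
   }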
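Finally, eliminate_io_wrmasks_instr is a filter callback for nir_lower_io_to_scalar: returning true means "scalarize this store". Its first check is the crux: a vectorized store_output writes util_bitcount(wrmask) contiguous components, so any writemask that doesn't name every source component (partial or sparse, e.g. .xz) forces a split. That check in isolation (standalone filter, covering only plain store_output and ignoring the xfb bookkeeping of the real one):

   #include "nir.h"
   #include "util/u_math.h"

   static bool
   scalarize_sparse_wrmask(const nir_instr *instr, const void *data)
   {
      if (instr->type != nir_instr_type_intrinsic)
         return false;
      const nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      if (intr->intrinsic != nir_intrinsic_store_output)
         return false;
      /* popcount vs. source width mismatch means the mask has a hole
       * or doesn't cover the whole vector */
      return util_bitcount(nir_intrinsic_write_mask(intr)) !=
             nir_intrinsic_src_components(intr, 0);
   }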