diff --git a/src/amd/vulkan/radv_nir_lower_vs_inputs.c b/src/amd/vulkan/radv_nir_lower_vs_inputs.c
index fbed239856b..d726e3c5be8 100644
--- a/src/amd/vulkan/radv_nir_lower_vs_inputs.c
+++ b/src/amd/vulkan/radv_nir_lower_vs_inputs.c
@@ -33,7 +33,7 @@ typedef struct {
    const struct radv_shader_args *args;
    const struct radv_shader_info *info;
    const struct radv_pipeline_key *pl_key;
-   uint32_t address32_hi;
+   const struct radeon_info *rad_info;
 } lower_vs_inputs_state;
 
 static nir_ssa_def *
@@ -80,6 +80,245 @@ lower_load_vs_input_from_prolog(nir_builder *b, nir_intrinsic_instr *intrin,
    return extracted;
 }
 
+static nir_ssa_def *
+calc_vs_input_index_instance_rate(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
+{
+   const uint32_t divisor = s->pl_key->vs.instance_rate_divisors[location];
+   nir_ssa_def *start_instance = nir_load_base_instance(b);
+
+   if (divisor == 0)
+      return start_instance;
+
+   nir_ssa_def *instance_id = nir_udiv_imm(b, nir_load_instance_id(b), divisor);
+   return nir_iadd(b, start_instance, instance_id);
+}
+
+static nir_ssa_def *
+calc_vs_input_index(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
+{
+   if (s->pl_key->vs.instance_rate_inputs & BITFIELD_BIT(location))
+      return calc_vs_input_index_instance_rate(b, location, s);
+
+   return nir_iadd(b, nir_load_first_vertex(b), nir_load_vertex_id_zero_base(b));
+}
+
+static bool
+can_use_untyped_load(const struct util_format_description *f, const unsigned bit_size)
+{
+   /* All components must have the same size and type. */
+   if (!f->is_array)
+      return false;
+
+   const struct util_format_channel_description *c = &f->channel[0];
+   return c->size == bit_size && bit_size >= 32;
+}
+
+static nir_ssa_def *
+oob_input_load_value(nir_builder *b, const unsigned channel_idx, const unsigned bit_size,
+                     const bool is_float)
+{
+   /* 22.1.1. Attribute Location and Component Assignment of the Vulkan 1.3 specification:
+    * For 64-bit data types, no default attribute values are provided. Input variables
+    * must not use more components than provided by the attribute.
+    */
+   if (bit_size == 64)
+      return nir_ssa_undef(b, 1, bit_size);
+
+   if (channel_idx == 3) {
+      if (is_float)
+         return nir_imm_floatN_t(b, 1.0, bit_size);
+      else
+         return nir_imm_intN_t(b, 1, bit_size);
+   }
+
+   return nir_imm_intN_t(b, 0, bit_size);
+}
+
+static unsigned
+count_format_bytes(const struct util_format_description *f, const unsigned first_channel,
+                   const unsigned num_channels)
+{
+   if (!num_channels)
+      return 0;
+
+   const unsigned last_channel = first_channel + num_channels - 1;
+   assert(last_channel < f->nr_channels);
+   unsigned bits = 0;
+   for (unsigned i = first_channel; i <= last_channel; ++i) {
+      bits += f->channel[i].size;
+   }
+
+   assert(bits % 8 == 0);
+   return bits / 8;
+}
+
+static nir_ssa_def *
+lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
+{
+   nir_src *offset_src = nir_get_io_offset_src(intrin);
+   assert(nir_src_is_const(*offset_src));
+
+   const unsigned base = nir_intrinsic_base(intrin);
+   const unsigned base_offset = nir_src_as_uint(*offset_src);
+   const unsigned location = base + base_offset - VERT_ATTRIB_GENERIC0;
+   const unsigned bit_size = intrin->dest.ssa.bit_size;
+   const unsigned dest_num_components = intrin->dest.ssa.num_components;
+
+   /* Convert the component offset to bit_size units.
+    * (Intrinsic component offset is in 32-bit units.)
+    *
+    * Inputs smaller than 32 bits consume the same space as 32-bit inputs,
+    * but 64-bit inputs consume twice as many.
+    * 64-bit variables must not have a component of 1 or 3.
+    * (See VK spec 15.1.5 "Component Assignment")
+    */
+   const unsigned component = nir_intrinsic_component(intrin) / (MAX2(32, bit_size) / 32);
+
+   /* Bitmask of the components (in bit_size units) of the
+    * current input load that are actually used.
+    */
+   const unsigned dest_use_mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
+
+   /* If the input is entirely unused, just replace it with undef.
+    * This is just in case we debug this pass without running DCE first.
+    */
+   if (!dest_use_mask)
+      return nir_ssa_undef(b, dest_num_components, bit_size);
+
+   const uint32_t attrib_binding = s->pl_key->vs.vertex_attribute_bindings[location];
+   const uint32_t attrib_offset = s->pl_key->vs.vertex_attribute_offsets[location];
+   const uint32_t attrib_stride = s->pl_key->vs.vertex_attribute_strides[location];
+   const enum pipe_format attrib_format = s->pl_key->vs.vertex_attribute_formats[location];
+   const struct util_format_description *f = util_format_description(attrib_format);
+   const unsigned binding_index =
+      s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
+   const unsigned desc_index =
+      util_bitcount(s->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, binding_index));
+
+   nir_ssa_def *vertex_buffers_arg = ac_nir_load_arg(b, &s->args->ac, s->args->ac.vertex_buffers);
+   nir_ssa_def *vertex_buffers =
+      nir_pack_64_2x32_split(b, vertex_buffers_arg, nir_imm_int(b, s->rad_info->address32_hi));
+   nir_ssa_def *descriptor =
+      nir_load_smem_amd(b, 4, vertex_buffers, nir_imm_int(b, desc_index * 16));
+   nir_ssa_def *base_index = calc_vs_input_index(b, location, s);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
+   /* Try to shrink the load format by skipping unused components from the start.
+    * This is beneficial because the backend may be able to emit fewer HW instructions.
+    * Only possible with array formats.
+    */
+   const unsigned first_used_channel = ffs(dest_use_mask) - 1;
+   const unsigned skipped_start = f->is_array ? first_used_channel : 0;
+
+   /* Number of channels we actually use and load.
+    * Don't shrink the format here because keeping it large might allow the backend
+    * to emit fewer (but larger than necessary) HW instructions.
+    */
+   const unsigned first_trailing_unused_channel = util_last_bit(dest_use_mask);
+   const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
+   const unsigned fetch_num_channels =
+      skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
+
+   /* Load VS inputs from VRAM.
+    *
+    * In the vast majority of cases this creates a single load_(typed)_buffer_amd
+    * intrinsic, and the backend is responsible for further splitting that
+    * into as many HW instructions as needed based on alignment.
+    *
+    * Take care to prevent loaded components from failing the range check
+    * by emitting several load intrinsics with different index sources.
+    * This is necessary because the backend can't further roll the const offset
+    * into the index source of MUBUF / MTBUF instructions.
+    */
+   nir_ssa_def *loads[NIR_MAX_VEC_COMPONENTS] = {0};
+   unsigned num_loads = 0;
+   for (unsigned x = 0, channels; x < fetch_num_channels; x += channels) {
+      channels = fetch_num_channels - x;
+      const unsigned start = skipped_start + x;
+      enum pipe_format fetch_format = attrib_format;
+      nir_ssa_def *index = base_index;
+
+      /* Add excess constant offset to the index.
+       */
+      unsigned const_off = attrib_offset + count_format_bytes(f, 0, start);
+      if (attrib_stride && const_off > attrib_stride) {
+         index = nir_iadd_imm(b, base_index, const_off / attrib_stride);
+         const_off %= attrib_stride;
+      }
+
+      /* Reduce the number of loaded channels until we can pass the range check.
+       * Only for array formats. The VK spec mandates proper alignment for packed formats.
+       * Note that NONE seems to occur in real use and is considered an array format.
+       */
+      if (f->is_array && fetch_format != PIPE_FORMAT_NONE) {
+         while (channels > 1 && attrib_stride &&
+                (const_off + count_format_bytes(f, start, channels)) > attrib_stride) {
+            channels--;
+         }
+
+         /* Keep the fetch format as large as possible to let the backend emit
+          * larger load instructions when it deems them beneficial.
+          */
+         fetch_format =
+            util_format_get_array(f->channel[0].type, f->channel[0].size, f->nr_channels - start,
+                                  f->is_unorm || f->is_snorm, f->channel[0].pure_integer);
+      }
+
+      assert(f->is_array || channels == fetch_num_channels);
+
+      /* Prefer untyped buffer loads if possible to avoid potential alignment issues.
+       * Typed loads can cause GPU hangs when used with improper alignment.
+       */
+      if (can_use_untyped_load(f, bit_size)) {
+         loads[num_loads++] =
+            nir_load_buffer_amd(b, channels, bit_size, descriptor, zero, zero, index,
+                                .base = const_off, .memory_modes = nir_var_shader_in);
+      } else {
+         const unsigned align_mul = MAX2(1, s->pl_key->vs.vertex_binding_align[attrib_binding]);
+         const unsigned align_offset = const_off % align_mul;
+
+         loads[num_loads++] = nir_load_typed_buffer_amd(
+            b, channels, bit_size, descriptor, zero, zero, index, .base = const_off,
+            .format = fetch_format, .align_mul = align_mul, .align_offset = align_offset,
+            .memory_modes = nir_var_shader_in);
+      }
+   }
+
+   nir_ssa_def *load = loads[0];
+
+   /* Extract the channels we actually need when we couldn't skip starting
+    * components or had to emit more than one load intrinsic.
+    */
+   if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
+      load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
+                              max_loaded_channels - first_used_channel, bit_size);
+
+   /* Return early if possible to avoid generating unnecessary IR. */
+   if (num_loads > 0 && first_used_channel == component &&
+       load->num_components == dest_num_components)
+      return load;
+
+   /* Fill unused and OOB components.
+    */
+   const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
+   nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
+   for (unsigned i = 0; i < dest_num_components; ++i) {
+      const unsigned c = i + component;
+      if (!(dest_use_mask & BITFIELD_BIT(c))) {
+         /* Fill unused channels with zero. */
+         channels[i] = nir_imm_zero(b, 1, bit_size);
+      } else if (c < max_loaded_channels) {
+         /* Use channels that were loaded from VRAM. */
+         assert(c >= first_used_channel);
+         channels[i] = nir_channel(b, load, c - first_used_channel);
+      } else {
+         /* Handle input loads that use more components than their format provides.
+          */
+         channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
+      }
+   }
+
+   return nir_vec(b, channels, dest_num_components);
+}
+
 static bool
 lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
 {
@@ -99,8 +338,7 @@ lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
    if (s->info->vs.dynamic_inputs) {
       replacement = lower_load_vs_input_from_prolog(b, intrin, s);
    } else {
-      /* TODO: lower non-dynamic inputs */
-      return false;
+      replacement = lower_load_vs_input(b, intrin, s);
    }
 
    nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
@@ -112,7 +350,7 @@ lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
 
 bool
 radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *vs_stage,
-                         const struct radv_pipeline_key *pl_key, uint32_t address32_hi)
+                         const struct radv_pipeline_key *pl_key, const struct radeon_info *rad_info)
 {
    assert(shader->info.stage == MESA_SHADER_VERTEX);
 
@@ -120,7 +358,7 @@ radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *v
       .info = &vs_stage->info,
       .args = &vs_stage->args,
       .pl_key = pl_key,
-      .address32_hi = address32_hi,
+      .rad_info = rad_info,
    };
 
    return nir_shader_instructions_pass(shader, lower_vs_input_instr,
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index ad6c4ac618c..c098318f73d 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3239,7 +3239,7 @@ radv_postprocess_nir(struct radv_pipeline *pipeline,
     */
    if (stage->stage == MESA_SHADER_VERTEX) {
      NIR_PASS(_, stage->nir, radv_nir_lower_vs_inputs, stage, pipeline_key,
-              device->physical_device->rad_info.address32_hi);
+              &device->physical_device->rad_info);
    }
 
    /* Lower I/O intrinsics to memory instructions. */
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 38e3a6249c0..c607016b120 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -557,7 +557,8 @@ void radv_nir_lower_abi(nir_shader *shader, enum amd_gfx_level gfx_level,
                         const struct radv_pipeline_key *pl_key, uint32_t address32_hi);
 
 bool radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_stage *vs_stage,
-                              const struct radv_pipeline_key *key, uint32_t address32_hi);
+                              const struct radv_pipeline_key *key,
+                              const struct radeon_info *rad_info);
 
 void radv_init_shader_arenas(struct radv_device *device);
 void radv_destroy_shader_arenas(struct radv_device *device);
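
The "add excess constant offset to the index" step in lower_load_vs_input() can be sanity-checked in isolation. The standalone sketch below is not part of the patch; the helper name example_split_offset and the sample offset/stride values are hypothetical, and it only mirrors the index/const_off arithmetic from the diff (whole multiples of the binding stride are folded into the per-vertex index, the remainder stays as the buffer load's constant offset).

#include <assert.h>
#include <stdio.h>

/* Illustrative sketch: mirrors the excess-constant-offset handling in
 * lower_load_vs_input(). Not part of the patch.
 */
static void
example_split_offset(unsigned attrib_offset, unsigned bytes_before_start, unsigned attrib_stride,
                     unsigned *index_bias, unsigned *const_off)
{
   unsigned off = attrib_offset + bytes_before_start;
   *index_bias = 0;

   /* Fold whole strides into the index source; keep only the remainder
    * as the constant offset of the emitted buffer load.
    */
   if (attrib_stride && off > attrib_stride) {
      *index_bias = off / attrib_stride;
      off %= attrib_stride;
   }

   *const_off = off;
}

int
main(void)
{
   unsigned index_bias, const_off;

   /* Hypothetical attribute: 36-byte offset into a binding with a 16-byte stride. */
   example_split_offset(36, 0, 16, &index_bias, &const_off);
   assert(index_bias == 2 && const_off == 4);
   printf("index += %u, constant offset = %u bytes\n", index_bias, const_off);

   return 0;
}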
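
Similarly, the channel-trimming arithmetic (first_used_channel, skipped_start, max_loaded_channels, fetch_num_channels) can be illustrated with plain integers. This is only a sketch: the helper names and the sample component-use mask are hypothetical stand-ins for ffs()/util_last_bit() and a real shader's use mask, mirroring the computation in lower_load_vs_input().

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Portable stand-ins for ffs(mask) - 1 and util_last_bit(mask); mask must be non-zero. */
static unsigned
first_set_bit(unsigned mask)
{
   unsigned i = 0;
   while (!(mask & 1u)) {
      mask >>= 1;
      i++;
   }
   return i;
}

static unsigned
last_bit(unsigned mask)
{
   unsigned n = 0;
   while (mask) {
      mask >>= 1;
      n++;
   }
   return n;
}

int
main(void)
{
   /* Hypothetical case: only .y and .z of a vec4 input are read (use mask 0b0110)
    * from an attribute whose format has 3 channels and is an array format.
    */
   const unsigned dest_use_mask = 0x6;
   const unsigned nr_channels = 3;
   const bool is_array = true;

   const unsigned first_used_channel = first_set_bit(dest_use_mask);       /* 1 (.y) */
   const unsigned skipped_start = is_array ? first_used_channel : 0;       /* skip .x */
   const unsigned first_trailing_unused_channel = last_bit(dest_use_mask); /* 3 */
   const unsigned max_loaded_channels =
      first_trailing_unused_channel < nr_channels ? first_trailing_unused_channel : nr_channels;
   const unsigned fetch_num_channels =
      skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;

   /* Only .y and .z are actually fetched from the vertex buffer. */
   assert(fetch_num_channels == 2);
   printf("fetch %u channels starting at channel %u\n", fetch_num_channels, skipped_start);

   return 0;
}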