radv: Apply swizzle and alpha adjust in radv_nir_lower_vs_inputs.

Handle all VS-input-related lowering (swizzle and alpha adjust) in a
single pass instead of having two separate passes.

Fossil DB stats on Rembrandt (GFX10.3):

Totals from 174 (0.13% of 134913) affected shaders:
VGPRs: 7736 -> 7520 (-2.79%)
CodeSize: 354004 -> 353604 (-0.11%); split: -0.17%, +0.06%
MaxWaves: 4196 -> 4248 (+1.24%)
Instrs: 65228 -> 65139 (-0.14%); split: -0.19%, +0.06%
Latency: 265823 -> 265728 (-0.04%); split: -0.12%, +0.08%
InvThroughput: 84629 -> 84644 (+0.02%); split: -0.08%, +0.10%
VClause: 1618 -> 1606 (-0.74%); split: -0.93%, +0.19%
SClause: 1382 -> 1379 (-0.22%); split: -0.36%, +0.14%
Copies: 5586 -> 5566 (-0.36%); split: -0.55%, +0.20%
PreSGPRs: 4994 -> 5037 (+0.86%); split: -0.10%, +0.96%
PreVGPRs: 4948 -> 4955 (+0.14%); split: -0.04%, +0.18%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16805>
Author: Timur Kristóf
Authored: 2023-03-06 12:29:28 -08:00
Committed by: Marge Bot
Parent: a46acdbc3f
Commit: 27c8131978

2 changed files with 80 additions and 126 deletions


@@ -152,6 +152,60 @@ count_format_bytes(const struct util_format_description *f, const unsigned first
    return bits / 8;
 }
 
+static bool
+format_needs_swizzle(const struct util_format_description *f)
+{
+   for (unsigned i = 0; i < f->nr_channels; ++i) {
+      if (f->swizzle[i] != PIPE_SWIZZLE_X + i)
+         return true;
+   }
+
+   return false;
+}
+
+static unsigned
+first_used_swizzled_channel(const struct util_format_description *f, const unsigned mask,
+                            const bool backwards)
+{
+   unsigned first_used = backwards ? 0 : f->nr_channels;
+   const unsigned it_mask = mask & BITFIELD_MASK(f->nr_channels);
+
+   u_foreach_bit (b, it_mask) {
+      assert(f->swizzle[b] != PIPE_SWIZZLE_0 && f->swizzle[b] != PIPE_SWIZZLE_1);
+      const unsigned c = f->swizzle[b] - PIPE_SWIZZLE_X;
+      first_used = backwards ? MAX2(first_used, c) : MIN2(first_used, c);
+   }
+
+   return first_used;
+}
+
+static nir_ssa_def *
+adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust,
+                          nir_ssa_def *alpha)
+{
+   if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
+      alpha = nir_f2u32(b, alpha);
+
+   /* For the integer-like cases, do a natural sign extension.
+    *
+    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
+    * the two LSBs of the exponent.
+    */
+   unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
+   alpha = nir_ibfe_imm(b, alpha, offset, 2u);
+
+   /* Convert back to the right type. */
+   if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
+      alpha = nir_i2f32(b, alpha);
+      alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
+   } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
+      alpha = nir_i2f32(b, alpha);
+   }
+
+   return alpha;
+}
+
 static nir_ssa_def *
 lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
 {
@@ -190,6 +244,8 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    const uint32_t attrib_stride = s->pl_key->vs.vertex_attribute_strides[location];
    const enum pipe_format attrib_format = s->pl_key->vs.vertex_attribute_formats[location];
    const struct util_format_description *f = util_format_description(attrib_format);
+   const struct ac_vtx_format_info *vtx_info =
+      ac_get_vtx_format_info(s->rad_info->gfx_level, s->rad_info->family, attrib_format);
    const unsigned binding_index =
       s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
    const unsigned desc_index =
@@ -203,18 +259,31 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    nir_ssa_def *base_index = calc_vs_input_index(b, location, s);
    nir_ssa_def *zero = nir_imm_int(b, 0);
 
+   /* We currently implement swizzling for all formats in shaders.
+    * Note, it is possible to specify swizzling in the DST_SEL fields of descriptors,
+    * but we don't use that because typed loads using the MTBUF instruction format
+    * don't support DST_SEL, so it's simpler to just handle it all in shaders.
+    */
+   const bool needs_swizzle = format_needs_swizzle(f);
+
+   /* We need to adjust the alpha channel as loaded by the HW,
+    * for example sign extension and normalization may be necessary.
+    */
+   const enum ac_vs_input_alpha_adjust alpha_adjust = vtx_info->alpha_adjust;
+
    /* Try to shrink the load format by skipping unused components from the start.
     * Beneficial because the backend may be able to emit fewer HW instructions.
     * Only possible with array formats.
     */
-   const unsigned first_used_channel = ffs(dest_use_mask) - 1;
+   const unsigned first_used_channel = first_used_swizzled_channel(f, dest_use_mask, false);
    const unsigned skipped_start = f->is_array ? first_used_channel : 0;
 
    /* Number of channels we actually use and load.
     * Don't shrink the format here because this might allow the backend to
     * emit fewer (but larger than needed) HW instructions.
     */
-   const unsigned first_trailing_unused_channel = util_last_bit(dest_use_mask);
+   const unsigned first_trailing_unused_channel =
+      first_used_swizzled_channel(f, dest_use_mask, true) + 1;
    const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
    const unsigned fetch_num_channels =
       skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
@@ -294,10 +363,12 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    /* Return early if possible to avoid generating unnecessary IR. */
    if (num_loads > 0 && first_used_channel == component &&
-       load->num_components == dest_num_components)
+       load->num_components == dest_num_components && !needs_swizzle &&
+       alpha_adjust == AC_ALPHA_ADJUST_NONE)
       return load;
 
    /* Fill unused and OOB components.
+    * Apply swizzle and alpha adjust according to the format.
     */
    const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
    nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
@@ -308,8 +379,12 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
          channels[i] = nir_imm_zero(b, 1, bit_size);
       } else if (c < max_loaded_channels) {
          /* Use channels that were loaded from VRAM. */
-         assert(c >= first_used_channel);
-         channels[i] = nir_channel(b, load, c - first_used_channel);
+         const unsigned sw = f->swizzle[c];
+         assert(sw >= first_used_channel);
+         channels[i] = nir_channel(b, load, sw - first_used_channel);
+
+         if (alpha_adjust != AC_ALPHA_ADJUST_NONE && c == 3)
+            channels[i] = adjust_vertex_fetch_alpha(b, alpha_adjust, channels[i]);
       } else {
          /* Handle input loads that are larger than their format. */
          channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
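
A note on the exponent trick used by adjust_vertex_fetch_alpha above: the SNORM path works because the four UNORM alpha values the hardware returns (0.0, 1/3, 2/3, 1.0) happen to carry the raw 2-bit values 0, 1, 2, 3 in the two LSBs of the float exponent. The following standalone sketch (plain C, written for illustration only and not part of the driver) verifies this: it mimics nir_ibfe_imm(alpha, 23, 2) with a bit-cast plus a sign-extending field extract, then applies the fmax(x, -1.0) clamp.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sign-extending 2-bit field extract at bit 23, mirroring nir_ibfe_imm(alpha, 23, 2). */
static int32_t ibfe_23_2(float f)
{
   uint32_t bits;
   memcpy(&bits, &f, sizeof(bits));     /* same bits the VGPR would hold */
   uint32_t field = (bits >> 23) & 0x3; /* two LSBs of the exponent */
   return field >= 2 ? (int32_t)field - 4 : (int32_t)field; /* sign-extend the 2-bit field */
}

int main(void)
{
   for (int raw = 0; raw < 4; raw++) {
      float as_unorm = (float)raw / 3.0f;     /* what the HW loads for a UNORM2 alpha */
      int32_t as_snorm = ibfe_23_2(as_unorm); /* recovered signed raw value */
      float alpha = as_snorm < -1 ? -1.0f : (float)as_snorm; /* fmax(x, -1.0) clamp */
      printf("raw=%d unorm=%.6f -> snorm=%d -> alpha=%.1f\n", raw, as_unorm, as_snorm, alpha);
   }
   return 0;
}

The printed alpha values 0.0, 1.0, -1.0, -1.0 match the 2-bit SNORM decode (with -2 clamped to -1), which is why no table lookup or comparison chain is needed in the shader.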


@@ -2789,122 +2789,6 @@ non_uniform_access_callback(const nir_src *src, void *_)
    return nir_chase_binding(*src).success ? 0x2 : 0x3;
 }
 
-static nir_ssa_def *
-radv_adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust,
-                               nir_ssa_def *alpha)
-{
-   if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
-      alpha = nir_f2u32(b, alpha);
-
-   /* For the integer-like cases, do a natural sign extension.
-    *
-    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
-    * the two LSBs of the exponent.
-    */
-   unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
-   alpha = nir_ibfe_imm(b, alpha, offset, 2u);
-
-   /* Convert back to the right type. */
-   if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
-      alpha = nir_i2f32(b, alpha);
-      alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
-   } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
-      alpha = nir_i2f32(b, alpha);
-   }
-
-   return alpha;
-}
-
-static bool
-radv_lower_vs_input(nir_shader *nir, const struct radv_physical_device *pdevice,
-                    const struct radv_pipeline_key *pipeline_key)
-{
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   bool progress = false;
-
-   if (pipeline_key->vs.has_prolog)
-      return false;
-
-   nir_builder b;
-   nir_builder_init(&b, impl);
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type != nir_instr_type_intrinsic)
-            continue;
-
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-         if (intrin->intrinsic != nir_intrinsic_load_input)
-            continue;
-
-         unsigned location = nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0;
-         unsigned component = nir_intrinsic_component(intrin);
-         unsigned num_components = intrin->dest.ssa.num_components;
-
-         enum pipe_format attrib_format = pipeline_key->vs.vertex_attribute_formats[location];
-         const struct ac_vtx_format_info *desc = ac_get_vtx_format_info(
-            pdevice->rad_info.gfx_level, pdevice->rad_info.family, attrib_format);
-         bool is_float =
-            nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin)) == nir_type_float;
-
-         unsigned mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
-         unsigned num_channels = MIN2(util_last_bit(mask), desc->num_channels);
-
-         static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
-         static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
-         bool post_shuffle = G_008F0C_DST_SEL_X(desc->dst_sel) == V_008F0C_SQ_SEL_Z;
-         const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
-
-         b.cursor = nir_after_instr(instr);
-
-         nir_ssa_def *channels[4];
-
-         if (post_shuffle) {
-            /* Expand to load 3 components because it's shuffled like X<->Z. */
-            intrin->num_components = MAX2(component + num_components, 3);
-            intrin->dest.ssa.num_components = intrin->num_components;
-
-            nir_intrinsic_set_component(intrin, 0);
-
-            num_channels = MAX2(num_channels, 3);
-         }
-
-         for (uint32_t i = 0; i < num_components; i++) {
-            unsigned idx = i + (post_shuffle ? component : 0);
-
-            if (swizzle[i + component] < num_channels) {
-               channels[i] = nir_channel(&b, &intrin->dest.ssa, swizzle[idx]);
-            } else if (i + component == 3) {
-               channels[i] = is_float ? nir_imm_floatN_t(&b, 1.0f, intrin->dest.ssa.bit_size)
-                                      : nir_imm_intN_t(&b, 1u, intrin->dest.ssa.bit_size);
-            } else {
-               channels[i] = nir_imm_zero(&b, 1, intrin->dest.ssa.bit_size);
-            }
-         }
-
-         if (desc->alpha_adjust != AC_ALPHA_ADJUST_NONE && component + num_components == 4) {
-            unsigned idx = num_components - 1;
-            channels[idx] = radv_adjust_vertex_fetch_alpha(&b, desc->alpha_adjust, channels[idx]);
-         }
-
-         nir_ssa_def *new_dest = nir_vec(&b, channels, num_components);
-
-         nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest,
-                                        new_dest->parent_instr);
-
-         progress = true;
-      }
-   }
-
-   if (progress)
-      nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
-   else
-      nir_metadata_preserve(impl, nir_metadata_all);
-
-   return progress;
-}
-
 void
 radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
                          struct radv_pipeline_stage *out_stage, gl_shader_stage stage)
@@ -3611,11 +3495,6 @@ radv_graphics_pipeline_compile(struct radv_graphics_pipeline *pipeline,
       stages[i].feedback.duration += os_time_get_nano() - stage_start;
    }
 
-   if (stages[MESA_SHADER_VERTEX].nir) {
-      NIR_PASS(_, stages[MESA_SHADER_VERTEX].nir, radv_lower_vs_input, device->physical_device,
-               pipeline_key);
-   }
-
    radv_fill_shader_info(pipeline, pipeline_layout, pipeline_key, stages, noop_fs, active_nir_stages);
 
    radv_declare_pipeline_args(device, stages, pipeline_key, active_nir_stages);
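
To make the new channel-narrowing helper easier to follow, here is a minimal standalone sketch of what first_used_swizzled_channel computes (plain C; the hand-written BGRA-style swizzle table {2, 1, 0, 3} is only an illustrative assumption standing in for the real util_format description). It maps the destination components the shader reads through the format swizzle and reports the lowest and highest memory channels the typed buffer load must cover.

#include <stdio.h>

/* Mirrors the logic of first_used_swizzled_channel: walk the destination-use mask,
 * map each used component through the format swizzle to a memory channel, and keep
 * the minimum (forward) or maximum (backwards) channel index. */
static unsigned used_channel_bound(const unsigned swizzle[4], unsigned nr_channels,
                                   unsigned use_mask, int backwards)
{
   unsigned bound = backwards ? 0 : nr_channels;
   for (unsigned b = 0; b < nr_channels; b++) {
      if (!(use_mask & (1u << b)))
         continue;
      const unsigned c = swizzle[b]; /* memory channel feeding destination component b */
      bound = backwards ? (c > bound ? c : bound) : (c < bound ? c : bound);
   }
   return bound;
}

int main(void)
{
   /* Hypothetical BGRA-style format: dest.x comes from channel 2, dest.z from channel 0. */
   const unsigned bgra_swizzle[4] = {2, 1, 0, 3};
   const unsigned use_mask = 0x1; /* the shader only reads dest.x */

   printf("first used memory channel: %u\n", used_channel_bound(bgra_swizzle, 4, use_mask, 0));
   printf("last used memory channel:  %u\n", used_channel_bound(bgra_swizzle, 4, use_mask, 1));
   return 0;
}

With an identity swizzle {0, 1, 2, 3} the helper degenerates to ffs(mask) - 1 in the forward direction and util_last_bit(mask) - 1 in the backward one, matching the computation it replaces.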