radv: Apply swizzle and alpha adjust in radv_nir_lower_vs_inputs.

Deal with VS input related things in a single pass instead of
having two different passes.

Fossil DB stats on Rembrandt (GFX10.3):

Totals from 174 (0.13% of 134913) affected shaders:
VGPRs: 7736 -> 7520 (-2.79%)
CodeSize: 354004 -> 353604 (-0.11%); split: -0.17%, +0.06%
MaxWaves: 4196 -> 4248 (+1.24%)
Instrs: 65228 -> 65139 (-0.14%); split: -0.19%, +0.06%
Latency: 265823 -> 265728 (-0.04%); split: -0.12%, +0.08%
InvThroughput: 84629 -> 84644 (+0.02%); split: -0.08%, +0.10%
VClause: 1618 -> 1606 (-0.74%); split: -0.93%, +0.19%
SClause: 1382 -> 1379 (-0.22%); split: -0.36%, +0.14%
Copies: 5586 -> 5566 (-0.36%); split: -0.55%, +0.20%
PreSGPRs: 4994 -> 5037 (+0.86%); split: -0.10%, +0.96%
PreVGPRs: 4948 -> 4955 (+0.14%); split: -0.04%, +0.18%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16805>
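Background on the alpha-adjust trick used by the adjust_vertex_fetch_alpha helper added below: for 2_10_10_10 SNORM attributes, the hardware returns the 2-bit alpha channel as 0.0, 1/3, 2/3 or 1.0, and those four floats happen to carry the original bit patterns 0, 1, 2, 3 in the two least significant bits of their IEEE-754 exponent, so a signed bitfield extract at bit offset 23 recovers the original signed value. A minimal standalone C sketch (hypothetical test code, not part of this commit or the Mesa tree) that checks this property:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Mimics nir_ibfe_imm(alpha, 23, 2): extract the 2 bits at bit 23 of the
 * float's bit pattern (the two LSBs of the exponent) and sign-extend them. */
static int32_t extract_snorm_alpha(float alpha)
{
   uint32_t bits;
   memcpy(&bits, &alpha, sizeof(bits));
   const int32_t field = (bits >> 23) & 0x3;
   return field >= 2 ? field - 4 : field;
}

int main(void)
{
   /* HW output for a 2-bit SNORM alpha -> original signed values 0, 1, -2, -1.
    * 1/3 has biased exponent 125 (LSBs 01), 2/3 has 126 (10), 1.0 has 127 (11). */
   assert(extract_snorm_alpha(0.0f) == 0);
   assert(extract_snorm_alpha(1.0f / 3.0f) == 1);
   assert(extract_snorm_alpha(2.0f / 3.0f) == -2);
   assert(extract_snorm_alpha(1.0f) == -1);
   return 0;
}

After this extract, the pass converts back with nir_i2f32 (giving 0.0, 1.0, -2.0, -1.0) and clamps with nir_fmax against -1.0, matching SNORM semantics where both -2 and -1 encode -1.0.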
@@ -152,6 +152,60 @@ count_format_bytes(const struct util_format_description *f, const unsigned first
    return bits / 8;
 }
 
+static bool
+format_needs_swizzle(const struct util_format_description *f)
+{
+   for (unsigned i = 0; i < f->nr_channels; ++i) {
+      if (f->swizzle[i] != PIPE_SWIZZLE_X + i)
+         return true;
+   }
+
+   return false;
+}
+
+static unsigned
+first_used_swizzled_channel(const struct util_format_description *f, const unsigned mask,
+                            const bool backwards)
+{
+   unsigned first_used = backwards ? 0 : f->nr_channels;
+   const unsigned it_mask = mask & BITFIELD_MASK(f->nr_channels);
+
+   u_foreach_bit (b, it_mask) {
+      assert(f->swizzle[b] != PIPE_SWIZZLE_0 && f->swizzle[b] != PIPE_SWIZZLE_1);
+      const unsigned c = f->swizzle[b] - PIPE_SWIZZLE_X;
+      first_used = backwards ? MAX2(first_used, c) : MIN2(first_used, c);
+   }
+
+   return first_used;
+}
+
+static nir_ssa_def *
+adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust,
+                          nir_ssa_def *alpha)
+{
+   if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
+      alpha = nir_f2u32(b, alpha);
+
+   /* For the integer-like cases, do a natural sign extension.
+    *
+    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
+    * the two LSBs of the exponent.
+    */
+   unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
+
+   alpha = nir_ibfe_imm(b, alpha, offset, 2u);
+
+   /* Convert back to the right type. */
+   if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
+      alpha = nir_i2f32(b, alpha);
+      alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
+   } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
+      alpha = nir_i2f32(b, alpha);
+   }
+
+   return alpha;
+}
+
 static nir_ssa_def *
 lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
 {
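The first_used_swizzled_channel helper added above maps each used destination component through the format swizzle to the memory channel that feeds it, then reduces with MIN2 (forwards) or MAX2 (backwards). A hypothetical standalone C sketch of the same reduction, using a hand-rolled BGRA-style swizzle table instead of a real util_format_description:

#include <assert.h>

static unsigned channel_scan(const unsigned swizzle[4], unsigned nr_channels,
                             unsigned mask, int backwards)
{
   unsigned result = backwards ? 0 : nr_channels;
   for (unsigned b = 0; b < nr_channels; ++b) {
      if (!(mask & (1u << b)))
         continue;
      const unsigned c = swizzle[b]; /* memory channel feeding dest component b */
      result = backwards ? (result > c ? result : c) : (result < c ? result : c);
   }
   return result;
}

int main(void)
{
   /* BGRA-style layout: dest.x comes from memory channel 2, dest.y from 1,
    * dest.z from 0, dest.w from 3. */
   const unsigned bgra[4] = {2, 1, 0, 3};

   /* A shader reading only dest.x still needs memory channel 2. */
   assert(channel_scan(bgra, 4, 0x1, 0) == 2); /* first used channel */
   assert(channel_scan(bgra, 4, 0x1, 1) == 2); /* last used channel */

   /* Reading dest.xy needs memory channels {2, 1}. */
   assert(channel_scan(bgra, 4, 0x3, 0) == 1);
   assert(channel_scan(bgra, 4, 0x3, 1) == 2);
   return 0;
}

This is why the swizzle has to be taken into account when shrinking loads below: the old ffs/util_last_bit computation on the raw destination mask could not see which memory channels are actually required.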
@@ -190,6 +244,8 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    const uint32_t attrib_stride = s->pl_key->vs.vertex_attribute_strides[location];
    const enum pipe_format attrib_format = s->pl_key->vs.vertex_attribute_formats[location];
    const struct util_format_description *f = util_format_description(attrib_format);
+   const struct ac_vtx_format_info *vtx_info =
+      ac_get_vtx_format_info(s->rad_info->gfx_level, s->rad_info->family, attrib_format);
    const unsigned binding_index =
       s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
    const unsigned desc_index =
@@ -203,18 +259,31 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    nir_ssa_def *base_index = calc_vs_input_index(b, location, s);
    nir_ssa_def *zero = nir_imm_int(b, 0);
 
+   /* We currently implement swizzling for all formats in shaders.
+    * Note, it is possible to specify swizzling in the DST_SEL fields of descriptors,
+    * but we don't use that because typed loads using the MTBUF instruction format
+    * don't support DST_SEL, so it's simpler to just handle it all in shaders.
+    */
+   const bool needs_swizzle = format_needs_swizzle(f);
+
+   /* We need to adjust the alpha channel as loaded by the HW,
+    * for example sign extension and normalization may be necessary.
+    */
+   const enum ac_vs_input_alpha_adjust alpha_adjust = vtx_info->alpha_adjust;
+
    /* Try to shrink the load format by skipping unused components from the start.
     * Beneficial because the backend may be able to emit fewer HW instructions.
     * Only possible with array formats.
     */
-   const unsigned first_used_channel = ffs(dest_use_mask) - 1;
+   const unsigned first_used_channel = first_used_swizzled_channel(f, dest_use_mask, false);
    const unsigned skipped_start = f->is_array ? first_used_channel : 0;
 
    /* Number of channels we actually use and load.
    * Don't shrink the format here because this might allow the backend to
    * emit fewer (but larger than needed) HW instructions.
    */
-   const unsigned first_trailing_unused_channel = util_last_bit(dest_use_mask);
+   const unsigned first_trailing_unused_channel =
+      first_used_swizzled_channel(f, dest_use_mask, true) + 1;
    const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
    const unsigned fetch_num_channels =
       skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
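To make the shrink arithmetic above concrete, a minimal sketch with made-up channel numbers (assuming an array format whose swizzled use mask covers memory channels 1..2; not part of the commit):

#include <assert.h>
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   /* Only memory channels 1 and 2 of a 4-channel array format are needed,
    * e.g. the shader reads two components that the swizzle maps to .yz. */
   const unsigned nr_channels = 4;
   const unsigned first_used_channel = 1; /* forwards scan */
   const unsigned last_used_channel = 2;  /* backwards scan */
   const int is_array = 1;

   const unsigned skipped_start = is_array ? first_used_channel : 0;
   const unsigned first_trailing_unused_channel = last_used_channel + 1;
   const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, nr_channels);
   const unsigned fetch_num_channels =
      skipped_start >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;

   /* The fetch starts 1 channel into the attribute and loads only 2 channels
    * instead of all 4. */
   assert(skipped_start == 1 && fetch_num_channels == 2);
   return 0;
}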
@@ -294,10 +363,12 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
 
    /* Return early if possible to avoid generating unnecessary IR. */
    if (num_loads > 0 && first_used_channel == component &&
-       load->num_components == dest_num_components)
+       load->num_components == dest_num_components && !needs_swizzle &&
+       alpha_adjust == AC_ALPHA_ADJUST_NONE)
       return load;
 
    /* Fill unused and OOB components.
+    * Apply swizzle and alpha adjust according to the format.
     */
    const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
    nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
@@ -308,8 +379,12 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
          channels[i] = nir_imm_zero(b, 1, bit_size);
       } else if (c < max_loaded_channels) {
          /* Use channels that were loaded from VRAM. */
-         assert(c >= first_used_channel);
-         channels[i] = nir_channel(b, load, c - first_used_channel);
+         const unsigned sw = f->swizzle[c];
+         assert(sw >= first_used_channel);
+         channels[i] = nir_channel(b, load, sw - first_used_channel);
+
+         if (alpha_adjust != AC_ALPHA_ADJUST_NONE && c == 3)
+            channels[i] = adjust_vertex_fetch_alpha(b, alpha_adjust, channels[i]);
       } else {
          /* Handle input loads that are larger than their format. */
          channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
@@ -2789,122 +2789,6 @@ non_uniform_access_callback(const nir_src *src, void *_)
    return nir_chase_binding(*src).success ? 0x2 : 0x3;
 }
 
-static nir_ssa_def *
-radv_adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust,
-                               nir_ssa_def *alpha)
-{
-   if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
-      alpha = nir_f2u32(b, alpha);
-
-   /* For the integer-like cases, do a natural sign extension.
-    *
-    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
-    * the two LSBs of the exponent.
-    */
-   unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
-
-   alpha = nir_ibfe_imm(b, alpha, offset, 2u);
-
-   /* Convert back to the right type. */
-   if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
-      alpha = nir_i2f32(b, alpha);
-      alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
-   } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
-      alpha = nir_i2f32(b, alpha);
-   }
-
-   return alpha;
-}
-
-static bool
-radv_lower_vs_input(nir_shader *nir, const struct radv_physical_device *pdevice,
-                    const struct radv_pipeline_key *pipeline_key)
-{
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   bool progress = false;
-
-   if (pipeline_key->vs.has_prolog)
-      return false;
-
-   nir_builder b;
-   nir_builder_init(&b, impl);
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type != nir_instr_type_intrinsic)
-            continue;
-
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-         if (intrin->intrinsic != nir_intrinsic_load_input)
-            continue;
-
-         unsigned location = nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0;
-
-         unsigned component = nir_intrinsic_component(intrin);
-         unsigned num_components = intrin->dest.ssa.num_components;
-
-         enum pipe_format attrib_format = pipeline_key->vs.vertex_attribute_formats[location];
-         const struct ac_vtx_format_info *desc = ac_get_vtx_format_info(
-            pdevice->rad_info.gfx_level, pdevice->rad_info.family, attrib_format);
-         bool is_float =
-            nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin)) == nir_type_float;
-
-         unsigned mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
-         unsigned num_channels = MIN2(util_last_bit(mask), desc->num_channels);
-
-         static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
-         static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
-         bool post_shuffle = G_008F0C_DST_SEL_X(desc->dst_sel) == V_008F0C_SQ_SEL_Z;
-         const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
-
-         b.cursor = nir_after_instr(instr);
-         nir_ssa_def *channels[4];
-
-         if (post_shuffle) {
-            /* Expand to load 3 components because it's shuffled like X<->Z. */
-            intrin->num_components = MAX2(component + num_components, 3);
-            intrin->dest.ssa.num_components = intrin->num_components;
-
-            nir_intrinsic_set_component(intrin, 0);
-
-            num_channels = MAX2(num_channels, 3);
-         }
-
-         for (uint32_t i = 0; i < num_components; i++) {
-            unsigned idx = i + (post_shuffle ? component : 0);
-
-            if (swizzle[i + component] < num_channels) {
-               channels[i] = nir_channel(&b, &intrin->dest.ssa, swizzle[idx]);
-            } else if (i + component == 3) {
-               channels[i] = is_float ? nir_imm_floatN_t(&b, 1.0f, intrin->dest.ssa.bit_size)
-                                      : nir_imm_intN_t(&b, 1u, intrin->dest.ssa.bit_size);
-            } else {
-               channels[i] = nir_imm_zero(&b, 1, intrin->dest.ssa.bit_size);
-            }
-         }
-
-         if (desc->alpha_adjust != AC_ALPHA_ADJUST_NONE && component + num_components == 4) {
-            unsigned idx = num_components - 1;
-            channels[idx] = radv_adjust_vertex_fetch_alpha(&b, desc->alpha_adjust, channels[idx]);
-         }
-
-         nir_ssa_def *new_dest = nir_vec(&b, channels, num_components);
-
-         nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest,
-                                        new_dest->parent_instr);
-
-         progress = true;
-      }
-   }
-
-   if (progress)
-      nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
-   else
-      nir_metadata_preserve(impl, nir_metadata_all);
-
-   return progress;
-}
-
 void
 radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
                          struct radv_pipeline_stage *out_stage, gl_shader_stage stage)
@@ -3611,11 +3495,6 @@ radv_graphics_pipeline_compile(struct radv_graphics_pipeline *pipeline,
       stages[i].feedback.duration += os_time_get_nano() - stage_start;
    }
 
-   if (stages[MESA_SHADER_VERTEX].nir) {
-      NIR_PASS(_, stages[MESA_SHADER_VERTEX].nir, radv_lower_vs_input, device->physical_device,
-               pipeline_key);
-   }
-
    radv_fill_shader_info(pipeline, pipeline_layout, pipeline_key, stages, noop_fs, active_nir_stages);
 
    radv_declare_pipeline_args(device, stages, pipeline_key, active_nir_stages);