radv,aco: stop lowering FS outputs in NIR

This was a bad idea because:
- it diverges too much from the fragment shader epilog
- it doesn't allow implementing alpha-to-coverage via MRTZ correctly
- it was supposed to be used by LLVM but this never happened

Reverting this allows us to fix alpha-to-coverage via MRTZ
on GFX11 easily, including for fragment shader epilogs.

fossils-db (NAVI21):
Totals from 20411 (15.13% of 134913) affected shaders:
VGPRs: 972056 -> 971400 (-0.07%); split: -0.08%, +0.01%
CodeSize: 92284804 -> 92295392 (+0.01%); split: -0.05%, +0.06%
MaxWaves: 465010 -> 465166 (+0.03%); split: +0.03%, -0.00%
Instrs: 17034162 -> 17034963 (+0.00%); split: -0.00%, +0.01%
Latency: 252013190 -> 251971764 (-0.02%); split: -0.03%, +0.02%
InvThroughput: 45859625 -> 45842556 (-0.04%); split: -0.04%, +0.01%
VClause: 324627 -> 324629 (+0.00%); split: -0.03%, +0.03%
SClause: 672918 -> 672826 (-0.01%); split: -0.05%, +0.04%
Copies: 1172126 -> 1158152 (-1.19%); split: -1.20%, +0.01%
Branches: 420602 -> 420604 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 1025441 -> 1025481 (+0.00%)
PreVGPRs: 861787 -> 860650 (-0.13%); split: -0.17%, +0.03%

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20126>
This commit is contained in:
Samuel Pitoiset
2022-12-02 09:24:17 +01:00
committed by Marge Bot
parent 3be728f1d0
commit a297ac10a4
2 changed files with 61 additions and 232 deletions

View File

@@ -3385,173 +3385,6 @@ radv_lower_vs_input(nir_shader *nir, const struct radv_physical_device *pdevice,
return progress;
}
/* Lower fragment shader color outputs (FRAG_RESULT_DATA0..N) from plain
 * 32/16-bit vectors into the packed representation expected by the hardware
 * color export format selected in the pipeline key (4-bit per-MRT fields in
 * ps.col_format, V_028714_SPI_SHADER_* values).
 *
 * Returns true if any store_output intrinsic was rewritten.
 */
static bool
radv_lower_fs_output(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
{
   /* With a fragment shader epilog, the epilog performs the export
    * conversion instead, so leave the outputs untouched here. */
   if (pipeline_key->ps.has_epilog)
      return false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   bool progress = false;
   nir_builder b;
   nir_builder_init(&b, impl);
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_output)
            continue;
         /* Only color attachments are lowered; FRAG_RESULT_* slots below
          * DATA0 (depth/stencil/sample mask) are skipped. */
         int slot = nir_intrinsic_base(intrin) - FRAG_RESULT_DATA0;
         if (slot < 0)
            continue;
         unsigned write_mask = nir_intrinsic_write_mask(intrin);
         /* Per-MRT state packed in the pipeline key: 4 format bits per slot,
          * plus one bit per slot for each of the int8/int10/NaN-fixup flags. */
         unsigned col_format = (pipeline_key->ps.col_format >> (4 * slot)) & 0xf;
         bool is_int8 = (pipeline_key->ps.is_int8 >> slot) & 1;
         bool is_int10 = (pipeline_key->ps.is_int10 >> slot) & 1;
         bool enable_mrt_output_nan_fixup = (pipeline_key->ps.enable_mrt_output_nan_fixup >> slot) & 1;
         bool is_16bit = intrin->src[0].ssa->bit_size == 16;
         /* SPI_SHADER_ZERO: this export is discarded, nothing to lower. */
         if (col_format == V_028714_SPI_SHADER_ZERO)
            continue;
         b.cursor = nir_before_instr(instr);
         nir_ssa_def *values[4];
         /* Extract the export values; unwritten channels become undefs. */
         for (unsigned i = 0; i < 4; i++) {
            if (write_mask & (1 << i)) {
               values[i] = nir_channel(&b, intrin->src[0].ssa, i);
            } else {
               values[i] = nir_ssa_undef(&b, 1, 32);
            }
         }
         /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if
          * requested. The compare is built with b.exact set so the x != x NaN
          * test is not optimized away under non-exact float rules. Internal
          * (meta) shaders are excluded. */
         if (enable_mrt_output_nan_fixup && !nir->info.internal && !is_16bit) {
            u_foreach_bit(i, write_mask) {
               const bool save_exact = b.exact;
               b.exact = true;
               nir_ssa_def *isnan = nir_fneu(&b, values[i], values[i]);
               b.exact = save_exact;
               values[i] = nir_bcsel(&b, isnan, nir_imm_zero(&b, 1, 32), values[i]);
            }
         }
         /* The compressed 16-bit-per-channel export formats need the four
          * channels converted/clamped and then packed pairwise into two
          * 32-bit values. Other formats export the channels as-is. */
         if (col_format == V_028714_SPI_SHADER_FP16_ABGR ||
             col_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             col_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             col_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            /* Convert and/or clamp the export values. */
            switch (col_format) {
            case V_028714_SPI_SHADER_UINT16_ABGR: {
               /* Clamp to the destination range: 8-bit (255) or 10-bit
                * (1023) RGB; for int10 the alpha channel is only 2 bits,
                * hence the max of 3. */
               unsigned max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
               u_foreach_bit(i, write_mask) {
                  if (is_int8 || is_int10) {
                     values[i] = nir_umin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 3u)
                                                                            : nir_imm_int(&b, max_rgb));
                  } else if (is_16bit) {
                     values[i] = nir_u2u32(&b, values[i]);
                  }
               }
               break;
            }
            case V_028714_SPI_SHADER_SINT16_ABGR: {
               /* Signed clamp: [-128,127] for int8, [-512,511] for int10
                * RGB; the 2-bit int10 alpha clamps to [-2,1]. The negative
                * bounds stored in an unsigned wrap modulo 2^32, and
                * nir_imm_int reinterprets the low 32 bits, which yields the
                * intended signed constants. */
               unsigned max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
               unsigned min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
               u_foreach_bit(i, write_mask) {
                  if (is_int8 || is_int10) {
                     values[i] = nir_imin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 1u)
                                                                            : nir_imm_int(&b, max_rgb));
                     values[i] = nir_imax(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, -2u)
                                                                            : nir_imm_int(&b, min_rgb));
                  } else if (is_16bit) {
                     values[i] = nir_i2i32(&b, values[i]);
                  }
               }
               break;
            }
            case V_028714_SPI_SHADER_UNORM16_ABGR:
            case V_028714_SPI_SHADER_SNORM16_ABGR:
               /* The norm-pack helpers below take 32-bit floats, so widen
                * 16-bit sources first. */
               u_foreach_bit(i, write_mask) {
                  if (is_16bit) {
                     values[i] = nir_f2f32(&b, values[i]);
                  }
               }
               break;
            default:
               /* FP16_ABGR needs no pre-conversion; the pack below handles
                * both 16-bit and 32-bit sources. */
               break;
            }
            /* Only nir_pack_32_2x16_split needs 16-bit inputs. */
            bool input_16_bit = col_format == V_028714_SPI_SHADER_FP16_ABGR && is_16bit;
            unsigned new_write_mask = 0;
            /* Pack the export values. After this loop values[0..1] hold the
             * packed pairs (channels 0+1 and 2+3) of the compressed export. */
            for (unsigned i = 0; i < 2; i++) {
               bool enabled = (write_mask >> (i * 2)) & 0x3;
               if (!enabled) {
                  values[i] = nir_ssa_undef(&b, 1, 32);
                  continue;
               }
               nir_ssa_def *src0 = values[i * 2];
               nir_ssa_def *src1 = values[i * 2 + 1];
               /* If only one channel of the pair is written, zero-fill the
                * other so the pack has defined inputs. */
               if (!(write_mask & (1 << (i * 2))))
                  src0 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
               if (!(write_mask & (1 << (i * 2 + 1))))
                  src1 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
               if (col_format == V_028714_SPI_SHADER_FP16_ABGR) {
                  if (is_16bit) {
                     /* Already half floats: just concatenate the bits. */
                     values[i] = nir_pack_32_2x16_split(&b, src0, src1);
                  } else {
                     values[i] = nir_pack_half_2x16_split(&b, src0, src1);
                  }
               } else if (col_format == V_028714_SPI_SHADER_UNORM16_ABGR) {
                  values[i] = nir_pack_unorm_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_SNORM16_ABGR) {
                  values[i] = nir_pack_snorm_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR) {
                  values[i] = nir_pack_uint_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
                  values[i] = nir_pack_sint_2x16(&b, nir_vec2(&b, src0, src1));
               }
               new_write_mask |= 1 << i;
            }
            /* Update the write mask for compressed outputs. */
            nir_intrinsic_set_write_mask(intrin, new_write_mask);
            intrin->num_components = util_last_bit(new_write_mask);
         }
         /* Rebuild the stored vector from the (possibly packed) values and
          * rewrite the intrinsic's source in place. */
         nir_ssa_def *new_src = nir_vec(&b, values, intrin->num_components);
         nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(new_src));
         progress = true;
      }
   }
   /* No control flow was added or removed, so block indices and dominance
    * stay valid when something changed; otherwise everything is preserved. */
   if (progress)
      nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
   else
      nir_metadata_preserve(impl, nir_metadata_all);
   return progress;
}
void
radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
struct radv_pipeline_stage *out_stage, gl_shader_stage stage)
@@ -4130,11 +3963,6 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
pipeline_key);
}
if (stages[MESA_SHADER_FRAGMENT].nir && !radv_use_llvm_for_stage(device, MESA_SHADER_FRAGMENT)) {
/* TODO: Convert the LLVM backend. */
NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_fs_output, pipeline_key);
}
radv_fill_shader_info(pipeline, pipeline_layout, pipeline_key, stages);
radv_declare_pipeline_args(device, stages, pipeline_key);