diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c index 1962e1466d1..f10deeb530b 100644 --- a/src/asahi/compiler/agx_compile.c +++ b/src/asahi/compiler/agx_compile.c @@ -419,26 +419,6 @@ agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr) agx_src_index(&instr->src[0])); } -static void -agx_write_sample_mask_1(agx_builder *b) -{ - if (b->shader->nir->info.fs.uses_discard && !b->shader->did_sample_mask) { - /* If the shader uses discard, the sample mask must be written by the - * shader on all execution paths. If we've reached the end of the shader, - * we are therefore still active and need to write a full sample mask. - * TODO: interactions with MSAA and gl_SampleMask writes - */ - agx_sample_mask(b, agx_immediate(0xFF), agx_immediate(1)); - agx_signal_pix(b, 1); - b->shader->did_sample_mask = true; - - assert(!(b->shader->nir->info.outputs_written & - (BITFIELD64_BIT(FRAG_RESULT_DEPTH) | - BITFIELD64_BIT(FRAG_RESULT_STENCIL))) && - "incompatible"); - } -} - static agx_instr * agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr) { @@ -451,8 +431,6 @@ agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr) agx_wait_pix(b, 0x000C); } - agx_write_sample_mask_1(b); - /* Compact the registers according to the mask */ agx_index compacted[4] = {agx_null()}; @@ -499,7 +477,6 @@ agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr) * maybe rename this flag to something more general. */ b->shader->out->writes_sample_mask = true; - assert(!b->shader->did_sample_mask && "incompatible"); return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base); } @@ -672,26 +649,6 @@ agx_emit_load_frag_coord(agx_builder *b, agx_index dst, agx_emit_collect_to(b, dst, 4, dests); } -/* - * Demoting a helper invocation is logically equivalent to zeroing the sample - * mask. Metal implement discard as such. - * - * XXX: Actually, Metal's "discard" is a demote, and what is implemented here - * is a demote. 
There might be a better way to implement this to get correct - * helper invocation semantics. For now, I'm kicking the can down the road. - */ -static agx_instr * -agx_emit_discard(agx_builder *b) -{ - assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage"); - agx_wait_pix(b, 0x0001); - b->shader->did_writeout = true; - - b->shader->out->writes_sample_mask = true; - agx_sample_mask(b, agx_immediate(0xFF), agx_immediate(0)); - return agx_signal_pix(b, 1); -} - static agx_instr * agx_load_compute_dimension(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr, enum agx_sr base) @@ -876,9 +833,6 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr) agx_emit_load_frag_coord(b, dst, instr); return NULL; - case nir_intrinsic_discard: - return agx_emit_discard(b); - case nir_intrinsic_sample_mask_agx: { assert(stage == MESA_SHADER_FRAGMENT); b->shader->out->writes_sample_mask = true; @@ -2202,10 +2156,6 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl, */ agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link); agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block)); - - if (ctx->stage == MESA_SHADER_FRAGMENT && !impl->function->is_preamble) - agx_write_sample_mask_1(&_b); - agx_logical_end(&_b); agx_stop(&_b); @@ -2427,7 +2377,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, /* Late clip plane lowering created discards */ if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, agx_nir_lower_zs_emit); + NIR_PASS_V(nir, agx_nir_lower_discard_zs_emit); } /* Late sysval lowering creates large loads. 
Load lowering creates unpacks */ @@ -2455,10 +2405,16 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, out->push_count = key->reserved_preamble; agx_optimize_nir(nir, &out->push_count); - /* Implement conditional discard with real control flow like Metal */ - NIR_PASS_V(nir, nir_lower_discard_if, - (nir_lower_discard_if_to_cf | nir_lower_demote_if_to_cf | - nir_lower_terminate_if_to_cf)); + /* Create sample_mask instructions late, since NIR's scheduling is not aware + * of the ordering requirements between sample_mask and pixel stores. + * + * Note: when epilogs are used, special handling is required since the sample + * count is dynamic when the main fragment shader is compiled. + */ + if (key->fs.nr_samples) { + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + NIR_PASS_V(nir, agx_nir_lower_sample_mask, key->fs.nr_samples); + } /* Must be last since NIR passes can remap driver_location freely */ if (nir->info.stage == MESA_SHADER_VERTEX) diff --git a/src/asahi/compiler/agx_compile.h b/src/asahi/compiler/agx_compile.h index 776044edda4..e218cdc6fa5 100644 --- a/src/asahi/compiler/agx_compile.h +++ b/src/asahi/compiler/agx_compile.h @@ -153,6 +153,13 @@ struct agx_fs_shader_key { * tilebuffer loads (including blending). */ bool ignore_tib_dependencies; + + /* In a monolithic fragment shader or in a fragment epilogue, the number of + * samples in the tilebuffer. In a non-monolithic fragment shader, leave + * zero. This is used for the correct lowering of sample_mask instructions, + * to ensure that all samples are written out. Can be set conservatively. 
+ */ + unsigned nr_samples; }; struct agx_shader_key { diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h index 9004c3065f7..35bb1a5f846 100644 --- a/src/asahi/compiler/agx_compiler.h +++ b/src/asahi/compiler/agx_compiler.h @@ -379,7 +379,7 @@ typedef struct { unsigned alloc; /* I don't really understand how writeout ops work yet */ - bool did_writeout, did_sample_mask; + bool did_writeout; /* Has r0l been zeroed yet due to control flow? */ bool any_cf; @@ -793,7 +793,8 @@ void agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies, void agx_compute_liveness(agx_context *ctx); void agx_liveness_ins_update(BITSET_WORD *live, agx_instr *I); -bool agx_nir_lower_zs_emit(nir_shader *s); +bool agx_nir_lower_discard_zs_emit(nir_shader *s); +bool agx_nir_lower_sample_mask(nir_shader *s, unsigned nr_samples); bool agx_nir_lower_texture(nir_shader *s, bool support_lod_bias); bool agx_nir_opt_preamble(nir_shader *s, unsigned *preamble_size); bool agx_nir_lower_load_mask(nir_shader *shader); diff --git a/src/asahi/compiler/agx_nir_lower_zs_emit.c b/src/asahi/compiler/agx_nir_lower_discard_zs_emit.c similarity index 73% rename from src/asahi/compiler/agx_nir_lower_zs_emit.c rename to src/asahi/compiler/agx_nir_lower_discard_zs_emit.c index 34af8e26cdb..dfddfb5d930 100644 --- a/src/asahi/compiler/agx_nir_lower_zs_emit.c +++ b/src/asahi/compiler/agx_nir_lower_discard_zs_emit.c @@ -12,7 +12,7 @@ #define BASE_S 2 static bool -lower(nir_function_impl *impl, nir_block *block) +lower_zs_emit(nir_function_impl *impl, nir_block *block) { nir_intrinsic_instr *zs_emit = NULL; bool progress = false; @@ -72,7 +72,7 @@ lower(nir_function_impl *impl, nir_block *block) } static bool -lower_discard_to_z(nir_builder *b, nir_instr *instr, UNUSED void *data) +lower_discard(nir_builder *b, nir_instr *instr, UNUSED void *data) { if (instr->type != nir_instr_type_intrinsic) return false; @@ -84,34 +84,42 @@ lower_discard_to_z(nir_builder *b, nir_instr *instr, 
UNUSED void *data) b->cursor = nir_before_instr(instr); - if (intr->intrinsic == nir_intrinsic_discard_if) - nir_push_if(b, intr->src[0].ssa); - - bool stencil_written = - b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - - nir_store_zs_agx(b, nir_imm_intN_t(b, ALL_SAMPLES, 16), - nir_imm_float(b, NAN), - stencil_written ? nir_imm_intN_t(b, 0, 16) - : nir_ssa_undef(b, 1, 16) /* stencil */, - .base = BASE_Z | (stencil_written ? BASE_S : 0)); + nir_ssa_def *all_samples = nir_imm_intN_t(b, ALL_SAMPLES, 16); + nir_ssa_def *no_samples = nir_imm_intN_t(b, 0, 16); if (intr->intrinsic == nir_intrinsic_discard_if) - nir_push_else(b, NULL); + no_samples = nir_bcsel(b, intr->src[0].ssa, no_samples, all_samples); + + /* This will get lowered later to zs_emit if needed */ + nir_sample_mask_agx(b, all_samples, no_samples); + b->shader->info.fs.uses_discard = false; + b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); nir_instr_remove(instr); return true; } -bool +static bool +agx_nir_lower_discard(nir_shader *s) +{ + if (!s->info.fs.uses_discard) + return false; + + return nir_shader_instructions_pass( + s, lower_discard, nir_metadata_block_index | nir_metadata_dominance, + NULL); +} + +static bool agx_nir_lower_zs_emit(nir_shader *s) { - bool any_progress = false; - + /* If depth/stencil isn't written, there's nothing to lower */ if (!(s->info.outputs_written & (BITFIELD64_BIT(FRAG_RESULT_STENCIL) | BITFIELD64_BIT(FRAG_RESULT_DEPTH)))) return false; + bool any_progress = false; + nir_foreach_function(function, s) { if (!function->impl) continue; @@ -119,7 +127,7 @@ agx_nir_lower_zs_emit(nir_shader *s) bool progress = false; nir_foreach_block(block, function->impl) { - progress |= lower(function->impl, block); + progress |= lower_zs_emit(function->impl, block); } if (progress) { @@ -132,9 +140,17 @@ agx_nir_lower_zs_emit(nir_shader *s) any_progress |= progress; } - any_progress |= nir_shader_instructions_pass( - s, 
lower_discard_to_z, nir_metadata_block_index | nir_metadata_dominance, - NULL); - s->info.fs.uses_discard = false; return any_progress; } + +bool +agx_nir_lower_discard_zs_emit(nir_shader *s) +{ + bool progress = false; + + /* Lower depth/stencil writes before discard so the interaction works */ + progress |= agx_nir_lower_zs_emit(s); + progress |= agx_nir_lower_discard(s); + + return progress; +} diff --git a/src/asahi/compiler/agx_nir_lower_sample_mask.c b/src/asahi/compiler/agx_nir_lower_sample_mask.c new file mode 100644 index 00000000000..ff66156ad0e --- /dev/null +++ b/src/asahi/compiler/agx_nir_lower_sample_mask.c @@ -0,0 +1,176 @@ +/* + * Copyright 2023 Alyssa Rosenzweig + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir_builder.h" +#include "agx_compiler.h" + +/* + * sample_mask takes two bitmasks as arguments, TARGET and LIVE. Each bit refers + * to an indexed sample. Roughly, the instruction does: + * + * foreach sample in TARGET { + * if sample in LIVE { + * mark sample live + * } else { + * kill sample + * } + * } + * + * As a special case, TARGET may be set to all-1s (~0) to refer to all samples + * regardless of the framebuffer sample count. + * + * For example, to discard an entire pixel unconditionally, we could run: + * + * sample_mask ~0, 0 + * + * sample_mask must follow these rules: + * + * 1. All sample_mask instructions affecting a sample must execute before a + * local_store_pixel instruction targeting that sample. + * + * 2. If sample_mask is used anywhere in a shader, the state of every sample + * must be set on all execution paths. That is, the union of the TARGET sets + * of executed sample_masks must contain all samples. + * + * 3. If a sample is killed, future sample_mask instructions will have + * no effect. The following code sequence correctly implements a conditional + * discard (if there are no other sample_mask instructions in the shader): + * + * sample_mask discarded, 0 + * sample_mask ~0, ~0 + * + * 4. 
If zs_emit is used anywhere in the shader, sample_mask must not be used. + * Instead, zs_emit with depth = NaN can be emitted. + * + * This pass legalizes sample_mask instructions to satisfy these rules. + */ + +#define ALL_SAMPLES (0xFF) +#define BASE_Z 1 +#define BASE_S 2 + +static bool +lower_sample_mask_to_zs(nir_builder *b, nir_instr *instr, UNUSED void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + bool depth_written = + b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH); + bool stencil_written = + b->shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + b->cursor = nir_before_instr(instr); + + /* Existing zs_emit instructions need to be fixed up to write their own depth + * for consistency. + */ + if (intr->intrinsic == nir_intrinsic_store_zs_agx && !depth_written) { + /* Load the current depth at this pixel */ + nir_ssa_def *z = nir_channel(b, nir_load_frag_coord(b), 2); + + /* Write it out from this store_zs */ + nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) | BASE_Z); + nir_instr_rewrite_src_ssa(instr, &intr->src[1], z); + + /* We'll set outputs_written after the pass in case there are multiple + * store_zs_agx instructions needing fixup. + */ + b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_ANY; + return true; + } + + if (intr->intrinsic != nir_intrinsic_sample_mask_agx) + return false; + + nir_ssa_def *target = intr->src[0].ssa; + nir_ssa_def *live = intr->src[1].ssa; + nir_ssa_def *discard = nir_iand(b, target, nir_inot(b, live)); + + /* Write a NaN depth value for discarded samples */ + nir_store_zs_agx(b, discard, nir_imm_float(b, NAN), + stencil_written ? nir_imm_intN_t(b, 0, 16) + : nir_ssa_undef(b, 1, 16) /* stencil */, + .base = BASE_Z | (stencil_written ? 
BASE_S : 0)); + + nir_instr_remove(instr); + return true; +} + +bool +agx_nir_lower_sample_mask(nir_shader *shader, unsigned nr_samples) +{ + if (!(shader->info.outputs_written & + (BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)))) + return false; + + /* sample_mask can't be used with zs_emit, so lower sample_mask to zs_emit */ + if (shader->info.outputs_written & (BITFIELD64_BIT(FRAG_RESULT_DEPTH) | + BITFIELD64_BIT(FRAG_RESULT_STENCIL))) { + bool progress = nir_shader_instructions_pass( + shader, lower_sample_mask_to_zs, + nir_metadata_block_index | nir_metadata_dominance, NULL); + + /* The lowering requires an unconditional depth write. We mark this after + * lowering so the lowering knows whether there was already a depth write + */ + assert(progress && "must have lowered something, given the outputs"); + shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DEPTH); + + return true; + } + + /* nir_lower_io_to_temporaries ensures that stores are in the last block */ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_block *block = nir_impl_last_block(impl); + + nir_builder b; + nir_builder_init(&b, impl); + + /* Check which samples get a value written in the last block */ + uint8_t samples_set = 0; + + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_sample_mask_agx) + continue; + + if (!nir_src_is_const(intr->src[0])) + continue; + + samples_set |= nir_src_as_uint(intr->src[0]); + } + + /* If all samples are set, we're good to go */ + if ((samples_set & BITFIELD_MASK(nr_samples)) == BITFIELD_MASK(nr_samples)) + return false; + + /* Otherwise, at least one sample is not set in the last block and hence may + * not be set at all. Insert an instruction in the last block to ensure it + * will be live. 
+ */ + b.cursor = nir_after_block(block); + + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_local_pixel_agx) + continue; + + b.cursor = nir_before_instr(instr); + break; + } + + nir_sample_mask_agx(&b, nir_imm_intN_t(&b, ALL_SAMPLES, 16), + nir_imm_intN_t(&b, ALL_SAMPLES, 16)); + return true; +} diff --git a/src/asahi/compiler/meson.build b/src/asahi/compiler/meson.build index 3dfa5350fbe..437bb75fbf6 100644 --- a/src/asahi/compiler/meson.build +++ b/src/asahi/compiler/meson.build @@ -9,7 +9,8 @@ libasahi_agx_files = files( 'agx_insert_waits.c', 'agx_nir_lower_address.c', 'agx_nir_lower_frag_sidefx.c', - 'agx_nir_lower_zs_emit.c', + 'agx_nir_lower_sample_mask.c', + 'agx_nir_lower_discard_zs_emit.c', 'agx_nir_lower_texture.c', 'agx_nir_lower_load_mask.c', 'agx_nir_lower_shared_bitsize.c', diff --git a/src/asahi/lib/agx_meta.c b/src/asahi/lib/agx_meta.c index 1560e8b5d87..6b8aea4ec73 100644 --- a/src/asahi/lib/agx_meta.c +++ b/src/asahi/lib/agx_meta.c @@ -75,6 +75,7 @@ agx_build_background_shader(struct agx_meta_cache *cache, struct agx_shader_key compiler_key = { .fs.ignore_tib_dependencies = true, + .fs.nr_samples = key->tib.nr_samples, }; for (unsigned rt = 0; rt < ARRAY_SIZE(key->op); ++rt) { diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index bab3c5aabd1..771ebd11154 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1418,6 +1418,9 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so, struct agx_shader_key base_key = {0}; + if (nir->info.stage == MESA_SHADER_FRAGMENT) + base_key.fs.nr_samples = 1; + NIR_PASS_V(nir, agx_nir_lower_sysvals, compiled, &base_key.reserved_preamble);