agx: Implement depth and stencil export

Lower FRAG_RESULT_DEPTH and FRAG_RESULT_STENCIL writes to a combined zs_emit
instruction with a multisampling index. To be used in the following commit.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20365>
This commit is contained in:
Alyssa Rosenzweig
2022-12-16 23:38:07 -05:00
committed by Marge Bot
parent 15155268de
commit 9578b47af3
8 changed files with 170 additions and 0 deletions

View File

@@ -494,6 +494,32 @@ agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
nir_intrinsic_base(instr));
}
/*
 * Emit a zs_emit instruction for a store_zs_agx intrinsic.
 *
 * The intrinsic base is a bitmask: bit 0 means src[1] (depth) is written and
 * bit 1 means src[2] (stencil) is written. When both are written, the two
 * values are packed into a single 2-component vector. src[0] is forwarded
 * unchanged as the first source of zs_emit.
 */
static agx_instr *
agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
{
   unsigned zs_mask = nir_intrinsic_base(instr);
   bool has_depth = zs_mask & 1;
   bool has_stencil = zs_mask & 2;

   /* TODO: Handle better */
   assert(!b->shader->key->fs.ignore_tib_dependencies && "not used");
   agx_writeout(b, 0x0001);

   agx_index depth = agx_src_index(&instr->src[1]);
   agx_index stencil = agx_src_index(&instr->src[2]);

   /* Select the payload: both values combined, or whichever one is written */
   agx_index payload;
   if (has_depth && has_stencil)
      payload = agx_vec2(b, depth, stencil);
   else if (has_depth)
      payload = depth;
   else
      payload = stencil;

   /* Not necessarily a sample mask but overlapping hw mechanism... Should
    * maybe rename this flag to something more general.
    */
   b->shader->out->writes_sample_mask = true;

   return agx_zs_emit(b, agx_src_index(&instr->src[0]), payload, zs_mask);
}
static void
agx_emit_local_load_pixel(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
@@ -713,6 +739,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
assert(stage == MESA_SHADER_VERTEX);
return agx_emit_store_vary(b, instr);
case nir_intrinsic_store_zs_agx:
assert(stage == MESA_SHADER_FRAGMENT);
return agx_emit_store_zs(b, instr);
case nir_intrinsic_store_local_pixel_agx:
assert(stage == MESA_SHADER_FRAGMENT);
return agx_emit_local_store_pixel(b, instr);
@@ -1880,6 +1910,8 @@ agx_preprocess_nir(nir_shader *nir)
NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
glsl_type_size, 0);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(nir, agx_nir_lower_zs_emit);
/* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
* exception, interpolate flat shaded at fp32. This works around a
* hardware limitation. The resulting code (with an extra f2f16 at the end
@@ -1952,6 +1984,16 @@ agx_compile_shader_nir(nir_shader *nir,
out->no_colour_output = !(nir->info.outputs_written >> FRAG_RESULT_DATA0);
out->disable_tri_merging = nir->info.fs.needs_all_helper_invocations ||
nir->info.fs.needs_quad_helper_invocations;
/* Report a canonical depth layout */
enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
if (!(nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_DEPTH)))
out->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
else if (layout == FRAG_DEPTH_LAYOUT_NONE)
out->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
else
out->depth_layout = layout;
}
agx_optimize_nir(nir, &out->push_count);

View File

@@ -173,6 +173,9 @@ struct agx_shader_info {
/* Does the shader control the sample mask? */
bool writes_sample_mask;
/* Depth layout, never equal to NONE */
enum gl_frag_depth_layout depth_layout;
/* Is colour output omitted? */
bool no_colour_output;

View File

@@ -301,6 +301,7 @@ typedef struct {
uint32_t channels;
uint32_t bfi_mask;
uint16_t pixel_offset;
uint16_t zs;
enum agx_sr sr;
enum agx_icond icond;
enum agx_fcond fcond;
@@ -806,6 +807,7 @@ void agx_compute_liveness(agx_context *ctx);
void agx_liveness_ins_update(BITSET_WORD *live, agx_instr *I);
bool agx_lower_resinfo(nir_shader *s);
bool agx_nir_lower_zs_emit(nir_shader *s);
bool agx_nir_lower_array_texture(nir_shader *s);
bool agx_nir_opt_preamble(nir_shader *s, unsigned *preamble_size);
bool agx_nir_lower_load_mask(nir_shader *shader);

View File

@@ -0,0 +1,89 @@
/*
* Copyright 2022 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "agx_compiler.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
/* Immediate used as the sample mask source of the emitted store_zs_agx */
#define ALL_SAMPLES 0xFF
/* Bits OR'd into the store_zs_agx base to record which of depth (Z) and
 * stencil (S) have been written.
 */
#define BASE_Z 1
#define BASE_S 2
/*
 * Fold the FRAG_RESULT_DEPTH / FRAG_RESULT_STENCIL store_output intrinsics of
 * one block into a single store_zs_agx intrinsic.
 *
 * The store_zs_agx is created lazily, immediately before the first matching
 * store encountered by the reverse walk. Each matching store then donates its
 * value to the corresponding source, sets its bit in the base, and is removed.
 *
 * Returns true if at least one store was rewritten.
 */
static bool
lower(nir_function_impl *impl, nir_block *block)
{
   nir_intrinsic_instr *zs_emit = NULL;
   bool progress = false;

   nir_foreach_instr_reverse_safe(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
      if (store->intrinsic != nir_intrinsic_store_output)
         continue;

      nir_io_semantics sem = nir_intrinsic_io_semantics(store);
      bool is_depth = (sem.location == FRAG_RESULT_DEPTH);
      bool is_stencil = (sem.location == FRAG_RESULT_STENCIL);
      if (!(is_depth || is_stencil))
         continue;

      if (!zs_emit) {
         nir_builder b;
         nir_builder_init(&b, impl);
         b.cursor = nir_before_instr(instr);

         /* Multisampling will get lowered later if needed, default to
          * broadcast
          */
         nir_ssa_def *sample_mask = nir_imm_intN_t(&b, ALL_SAMPLES, 16);
         zs_emit = nir_store_zs_agx(&b, sample_mask,
                                    nir_ssa_undef(&b, 1, 32) /* depth */,
                                    nir_ssa_undef(&b, 1, 16) /* stencil */);
      }

      /* Depth goes in source 1, stencil in source 2 */
      unsigned slot, flag;
      if (is_depth) {
         slot = 1;
         flag = BASE_Z;
      } else {
         slot = 2;
         flag = BASE_S;
      }

      assert((nir_intrinsic_base(zs_emit) & flag) == 0 &&
             "each of depth/stencil may only be written once");

      nir_instr_rewrite_src_ssa(&zs_emit->instr, &zs_emit->src[slot],
                                store->src[0].ssa);
      nir_intrinsic_set_base(zs_emit, nir_intrinsic_base(zs_emit) | flag);

      nir_instr_remove(instr);
      progress = true;
   }

   return progress;
}
/*
 * Run the depth/stencil store lowering over every block of every function
 * that has an implementation.
 *
 * Block-index and dominance metadata are preserved for functions that were
 * changed; all metadata is preserved otherwise. Returns true if any function
 * was changed.
 */
bool
agx_nir_lower_zs_emit(nir_shader *s)
{
   bool any_progress = false;

   nir_foreach_function(function, s) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      bool changed = false;
      nir_foreach_block(block, impl) {
         if (lower(impl, block))
            changed = true;
      }

      nir_metadata_preserve(impl,
                            changed ? (nir_metadata_block_index |
                                       nir_metadata_dominance)
                                    : nir_metadata_all);

      if (changed)
         any_progress = true;
   }

   return any_progress;
}

View File

@@ -117,6 +117,7 @@ NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
NEST = immediate("nest")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
PERSPECTIVE = immediate("perspective", "bool")
SR = enum("sr", {
0: 'threadgroup_position_in_grid.x',
@@ -251,6 +252,10 @@ op("get_sr", (0x72, 0x7F | L, 4, _), dests = 1, imms = [SR])
op("sample_mask", (0x7fc1, 0xffff, 6, _), dests = 0, srcs = 1, can_eliminate = False)
# Sources: sample mask, combined depth/stencil
op("zs_emit", (0x41, 0xFF | L, 4, _), dests = 0, srcs = 2,
can_eliminate = False, imms = [ZS])
# Essentially same encoding. Last source is the sample mask
op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 1,
imms = [FORMAT, MASK, PIXEL_OFFSET], can_reorder = False)

View File

@@ -123,6 +123,7 @@ agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I,
/* cmpselsrc takes integer immediates only */
if (s >= 2 && I->op == AGX_OPCODE_FCMPSEL) float_src = false;
if (I->op == AGX_OPCODE_ST_TILE && s == 0) continue;
if (I->op == AGX_OPCODE_ZS_EMIT && s != 0) continue;
if (float_src) {
bool fp16 = (def->dest[0].size == AGX_SIZE_16);
@@ -179,6 +180,7 @@ agx_optimizer_copyprop(agx_instr **defs, agx_instr *I)
(I->op == AGX_OPCODE_DEVICE_LOAD &&
(s != 0 || def->src[0].value >= 256)) ||
I->op == AGX_OPCODE_PHI ||
I->op == AGX_OPCODE_ZS_EMIT ||
I->op == AGX_OPCODE_ST_TILE ||
I->op == AGX_OPCODE_LD_TILE ||
I->op == AGX_OPCODE_BLOCK_IMAGE_STORE ||

View File

@@ -719,6 +719,32 @@ agx_pack_instr(struct util_dynarray *emission, struct util_dynarray *fixups, agx
break;
}
/* Pack zs_emit into a single 32-bit word appended to the emission buffer. */
case AGX_OPCODE_ZS_EMIT:
{
/* S is source 0: either an immediate that must fit in 8 bits, or a register
 * that must pass the alignment check.
 */
agx_index S = I->src[0];
if (S.type == AGX_INDEX_IMMEDIATE)
assert(S.value < BITFIELD_BIT(8));
else
assert_register_is_aligned(S);
/* T is source 1 and is always checked as an aligned register */
agx_index T = I->src[1];
assert_register_is_aligned(T);
/* The zs immediate must be 1, 2 or 3 */
assert(I->zs >= 1 && I->zs <= 3);
/* Word layout as built below:
 *   exact opcode bits from the opcode table
 *   bit 8       set when S is an immediate
 *   bits 9-14   low 6 bits of S
 *   bits 16-21  low 6 bits of T
 *   bit 24 up   remaining high bits of S
 *   bit 26 up   remaining high bits of T
 *   bit 29 up   zs immediate
 */
uint32_t word0 =
agx_opcodes_info[I->op].encoding.exact |
((S.type == AGX_INDEX_IMMEDIATE) ? (1 << 8) : 0) |
((S.value & BITFIELD_MASK(6)) << 9) |
((T.value & BITFIELD_MASK(6)) << 16) |
((T.value >> 6) << 26) |
((S.value >> 6) << 24) |
(I->zs << 29);
/* Grow the emission array by 4 bytes and copy the word in */
memcpy(util_dynarray_grow_bytes(emission, 1, 4), &word0, 4);
break;
}
case AGX_OPCODE_JMP_EXEC_ANY:
case AGX_OPCODE_JMP_EXEC_NONE:
{

View File

@@ -23,6 +23,7 @@ libasahi_agx_files = files(
'agx_compile.c',
'agx_dce.c',
'agx_liveness.c',
'agx_nir_lower_zs_emit.c',
'agx_nir_lower_texture.c',
'agx_nir_lower_load_mask.c',
'agx_nir_lower_ubo.c',