ac/nir/ngg: Use attribute ring for mesh shader params.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21409>
This commit is contained in:
@@ -155,6 +155,7 @@ enum {
|
||||
typedef enum {
|
||||
ms_out_mode_lds,
|
||||
ms_out_mode_scratch_ring,
|
||||
ms_out_mode_attr_ring,
|
||||
ms_out_mode_var,
|
||||
} ms_out_mode;
|
||||
|
||||
@@ -184,6 +185,14 @@ typedef struct
|
||||
ms_out_part prm_attr;
|
||||
} scratch_ring;
|
||||
|
||||
/* VRAM attributes ring (GFX11 only) for all non-position outputs.
|
||||
* GFX11 doesn't have to reload attributes from this ring at the end of the shader.
|
||||
*/
|
||||
struct {
|
||||
ms_out_part vtx_attr;
|
||||
ms_out_part prm_attr;
|
||||
} attr_ring;
|
||||
|
||||
/* Outputs without cross-invocation access can be stored in variables. */
|
||||
struct {
|
||||
ms_out_part vtx_attr;
|
||||
@@ -3692,6 +3701,9 @@ ms_get_out_layout_part(unsigned location,
|
||||
} else if (mask & s->layout.scratch_ring.prm_attr.mask) {
|
||||
*out_mode = ms_out_mode_scratch_ring;
|
||||
return &s->layout.scratch_ring.prm_attr;
|
||||
} else if (mask & s->layout.attr_ring.prm_attr.mask) {
|
||||
*out_mode = ms_out_mode_attr_ring;
|
||||
return &s->layout.attr_ring.prm_attr;
|
||||
} else if (mask & s->layout.var.prm_attr.mask) {
|
||||
*out_mode = ms_out_mode_var;
|
||||
return &s->layout.var.prm_attr;
|
||||
@@ -3703,6 +3715,9 @@ ms_get_out_layout_part(unsigned location,
|
||||
} else if (mask & s->layout.scratch_ring.vtx_attr.mask) {
|
||||
*out_mode = ms_out_mode_scratch_ring;
|
||||
return &s->layout.scratch_ring.vtx_attr;
|
||||
} else if (mask & s->layout.attr_ring.vtx_attr.mask) {
|
||||
*out_mode = ms_out_mode_attr_ring;
|
||||
return &s->layout.attr_ring.vtx_attr;
|
||||
} else if (mask & s->layout.var.vtx_attr.mask) {
|
||||
*out_mode = ms_out_mode_var;
|
||||
return &s->layout.var.vtx_attr;
|
||||
@@ -3786,6 +3801,20 @@ ms_store_arrayed_output_intrin(nir_builder *b,
|
||||
.write_mask = write_mask,
|
||||
.memory_modes = nir_var_shader_out,
|
||||
.access = ACCESS_COHERENT);
|
||||
} else if (out_mode == ms_out_mode_attr_ring) {
|
||||
/* GFX11+: Store params straight to the attribute ring.
|
||||
*
|
||||
* Even though the access pattern may not be the most optimal,
|
||||
* this is still much better than reserving LDS and losing waves.
|
||||
* (Also much better than storing and reloading from the scratch ring.)
|
||||
*/
|
||||
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
|
||||
nir_ssa_def *ring = nir_load_ring_attr_amd(b);
|
||||
nir_ssa_def *soffset = nir_load_ring_attr_offset_amd(b);
|
||||
soffset = nir_iadd_imm(b, soffset, s->vs_output_param_offset[io_sem.location] * 16 * 32);
|
||||
nir_store_buffer_amd(b, store_val, ring, base_addr_off, soffset, arr_index, .base = const_off,
|
||||
.memory_modes = nir_var_shader_out,
|
||||
.access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
|
||||
} else if (out_mode == ms_out_mode_var) {
|
||||
if (store_val->bit_size > 32) {
|
||||
/* Split 64-bit store values to 32-bit components. */
|
||||
@@ -3840,6 +3869,16 @@ ms_load_arrayed_output(nir_builder *b,
|
||||
.base = const_off,
|
||||
.memory_modes = nir_var_shader_out,
|
||||
.access = ACCESS_COHERENT);
|
||||
} else if (out_mode == ms_out_mode_attr_ring) {
|
||||
/* Loading from the attribute ring shouldn't be necessary,
|
||||
* but we implement it for the sake of NV_mesh_shader.
|
||||
*/
|
||||
nir_ssa_def *ring = nir_load_ring_attr_amd(b);
|
||||
nir_ssa_def *soffset = nir_load_ring_attr_offset_amd(b);
|
||||
soffset = nir_iadd_imm(b, soffset, s->vs_output_param_offset[driver_location] * 16 * 32);
|
||||
return nir_load_buffer_amd(b, num_components, load_bit_size, ring, base_addr_off, soffset,
|
||||
arr_index, .base = const_off, .memory_modes = nir_var_shader_out,
|
||||
.access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
|
||||
} else if (out_mode == ms_out_mode_var) {
|
||||
nir_ssa_def *arr[8] = {0};
|
||||
unsigned num_32bit_components = num_components * load_bit_size / 32;
|
||||
@@ -4275,17 +4314,20 @@ emit_ms_finale(nir_builder *b, lower_ngg_ms_state *s)
|
||||
nir_ssa_def *has_output_vertex = nir_ilt(b, invocation_index, num_vtx);
|
||||
nir_if *if_has_output_vertex = nir_push_if(b, has_output_vertex);
|
||||
{
|
||||
/* All per-vertex attributes. */
|
||||
ms_emit_arrayed_outputs(b, invocation_index, s->per_vertex_outputs, s);
|
||||
const uint64_t per_vertex_outputs =
|
||||
s->per_vertex_outputs & ~s->layout.attr_ring.vtx_attr.mask;
|
||||
ms_emit_arrayed_outputs(b, invocation_index, per_vertex_outputs, s);
|
||||
|
||||
ac_nir_export_position(b, s->gfx_level, s->clipdist_enable_mask,
|
||||
!s->has_param_exports, false,
|
||||
s->per_vertex_outputs, s->outputs);
|
||||
|
||||
if (s->has_param_exports) {
|
||||
ac_nir_export_parameters(b, s->vs_output_param_offset,
|
||||
s->per_vertex_outputs, 0,
|
||||
s->outputs, NULL, NULL);
|
||||
/* Export generic attributes on GFX10.3
|
||||
* (On GFX11 they are already stored in the attribute ring.)
|
||||
*/
|
||||
if (s->has_param_exports && s->gfx_level == GFX10_3) {
|
||||
ac_nir_export_parameters(b, s->vs_output_param_offset, per_vertex_outputs, 0, s->outputs,
|
||||
NULL, NULL);
|
||||
}
|
||||
}
|
||||
nir_pop_if(b, if_has_output_vertex);
|
||||
@@ -4294,8 +4336,8 @@ emit_ms_finale(nir_builder *b, lower_ngg_ms_state *s)
|
||||
nir_ssa_def *has_output_primitive = nir_ilt(b, invocation_index, num_prm);
|
||||
nir_if *if_has_output_primitive = nir_push_if(b, has_output_primitive);
|
||||
{
|
||||
/* Generic per-primitive attributes. */
|
||||
uint64_t per_primitive_outputs = s->per_primitive_outputs & ~SPECIAL_MS_OUT_MASK;
|
||||
uint64_t per_primitive_outputs =
|
||||
s->per_primitive_outputs & ~s->layout.attr_ring.prm_attr.mask & ~SPECIAL_MS_OUT_MASK;
|
||||
ms_emit_arrayed_outputs(b, invocation_index, per_primitive_outputs, s);
|
||||
|
||||
/* Insert layer output store if the pipeline uses multiview but the API shader doesn't write it. */
|
||||
@@ -4344,9 +4386,13 @@ emit_ms_finale(nir_builder *b, lower_ngg_ms_state *s)
|
||||
|
||||
ms_emit_primitive_export(b, prim_exp_arg, per_primitive_outputs, s);
|
||||
|
||||
ac_nir_export_parameters(b, s->vs_output_param_offset,
|
||||
per_primitive_outputs, 0,
|
||||
s->outputs, NULL, NULL);
|
||||
/* Export generic attributes on GFX10.3
|
||||
* (On GFX11 they are already stored in the attribute ring.)
|
||||
*/
|
||||
if (s->has_param_exports && s->gfx_level == GFX10_3) {
|
||||
ac_nir_export_parameters(b, s->vs_output_param_offset, per_primitive_outputs, 0,
|
||||
s->outputs, NULL, NULL);
|
||||
}
|
||||
}
|
||||
nir_pop_if(b, if_has_output_primitive);
|
||||
}
|
||||
@@ -4520,18 +4566,32 @@ ms_calculate_arrayed_output_layout(ms_out_mem_layout *l,
|
||||
}
|
||||
|
||||
static ms_out_mem_layout
|
||||
ms_calculate_output_layout(unsigned api_shared_size,
|
||||
uint64_t per_vertex_output_mask,
|
||||
uint64_t per_primitive_output_mask,
|
||||
uint64_t cross_invocation_output_access,
|
||||
unsigned max_vertices,
|
||||
unsigned max_primitives,
|
||||
unsigned vertices_per_prim)
|
||||
ms_calculate_output_layout(enum amd_gfx_level gfx_level, unsigned api_shared_size,
|
||||
uint64_t per_vertex_output_mask, uint64_t per_primitive_output_mask,
|
||||
uint64_t cross_invocation_output_access, unsigned max_vertices,
|
||||
unsigned max_primitives, unsigned vertices_per_prim)
|
||||
{
|
||||
/* These outputs always need export instructions and can't use the attributes ring. */
|
||||
const uint64_t always_export_mask =
|
||||
VARYING_BIT_POS | VARYING_BIT_CULL_DIST0 | VARYING_BIT_CULL_DIST1 | VARYING_BIT_CLIP_DIST0 |
|
||||
VARYING_BIT_CLIP_DIST1 | VARYING_BIT_PSIZ | VARYING_BIT_VIEWPORT |
|
||||
VARYING_BIT_PRIMITIVE_SHADING_RATE | VARYING_BIT_LAYER |
|
||||
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) | BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
|
||||
|
||||
const bool use_attr_ring = gfx_level >= GFX11;
|
||||
const uint64_t attr_ring_per_vertex_output_mask =
|
||||
use_attr_ring ? per_vertex_output_mask & ~always_export_mask : 0;
|
||||
const uint64_t attr_ring_per_primitive_output_mask =
|
||||
use_attr_ring ? per_primitive_output_mask & ~always_export_mask : 0;
|
||||
|
||||
const uint64_t lds_per_vertex_output_mask =
|
||||
per_vertex_output_mask & cross_invocation_output_access & ~SPECIAL_MS_OUT_MASK;
|
||||
per_vertex_output_mask & ~attr_ring_per_vertex_output_mask & cross_invocation_output_access &
|
||||
~SPECIAL_MS_OUT_MASK;
|
||||
const uint64_t lds_per_primitive_output_mask =
|
||||
per_primitive_output_mask & cross_invocation_output_access & ~SPECIAL_MS_OUT_MASK;
|
||||
per_primitive_output_mask & ~attr_ring_per_primitive_output_mask &
|
||||
cross_invocation_output_access & ~SPECIAL_MS_OUT_MASK;
|
||||
|
||||
const bool cross_invocation_indices =
|
||||
cross_invocation_output_access & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
|
||||
const bool cross_invocation_cull_primitive =
|
||||
@@ -4540,9 +4600,15 @@ ms_calculate_output_layout(unsigned api_shared_size,
|
||||
/* Shared memory used by the API shader. */
|
||||
ms_out_mem_layout l = { .lds = { .total_size = api_shared_size } };
|
||||
|
||||
/* GFX11+: use attribute ring for all generic attributes. */
|
||||
l.attr_ring.vtx_attr.mask = attr_ring_per_vertex_output_mask;
|
||||
l.attr_ring.prm_attr.mask = attr_ring_per_primitive_output_mask;
|
||||
|
||||
/* Outputs without cross-invocation access can be stored in variables. */
|
||||
l.var.vtx_attr.mask = per_vertex_output_mask & ~cross_invocation_output_access;
|
||||
l.var.prm_attr.mask = per_primitive_output_mask & ~cross_invocation_output_access;
|
||||
l.var.vtx_attr.mask =
|
||||
per_vertex_output_mask & ~attr_ring_per_vertex_output_mask & ~cross_invocation_output_access;
|
||||
l.var.prm_attr.mask = per_primitive_output_mask & ~attr_ring_per_primitive_output_mask &
|
||||
~cross_invocation_output_access;
|
||||
|
||||
/* Workgroup information, see ms_workgroup_* for the layout. */
|
||||
l.lds.workgroup_info_addr = ALIGN(l.lds.total_size, 16);
|
||||
@@ -4628,9 +4694,9 @@ ac_nir_lower_ngg_ms(nir_shader *shader,
|
||||
unsigned max_vertices = shader->info.mesh.max_vertices_out;
|
||||
unsigned max_primitives = shader->info.mesh.max_primitives_out;
|
||||
|
||||
ms_out_mem_layout layout =
|
||||
ms_calculate_output_layout(shader->info.shared_size, per_vertex_outputs, per_primitive_outputs,
|
||||
cross_invocation_access, max_vertices, max_primitives, vertices_per_prim);
|
||||
ms_out_mem_layout layout = ms_calculate_output_layout(
|
||||
gfx_level, shader->info.shared_size, per_vertex_outputs, per_primitive_outputs,
|
||||
cross_invocation_access, max_vertices, max_primitives, vertices_per_prim);
|
||||
|
||||
shader->info.shared_size = layout.lds.total_size;
|
||||
*out_needs_scratch_ring = layout.scratch_ring.vtx_attr.mask || layout.scratch_ring.prm_attr.mask;
|
||||
|
Reference in New Issue
Block a user