radeonsi/gfx11: implement attributes through memory

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16328>
This commit is contained in:
Marek Olšák
2021-05-11 00:50:43 -04:00
committed by Marge Bot
parent 91a7f43f0b
commit afc110a1f6
14 changed files with 144 additions and 8 deletions

View File

@@ -111,6 +111,7 @@ struct ac_shader_args {
struct ac_arg es2gs_offset; /* separate legacy ES */
struct ac_arg gs2vs_offset; /* legacy GS */
struct ac_arg gs_wave_id; /* legacy GS */
struct ac_arg gs_attr_offset; /* gfx11+: attribute ring offset in 512B increments */
struct ac_arg gs_vtx_offset[6]; /* GFX6-8: [0-5], GFX9+: [0-2] packed */
struct ac_arg gs_prim_id;
struct ac_arg gs_invocation_id;

View File

@@ -1322,6 +1322,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
if (ctx->stage == MESA_SHADER_TESS_EVAL)
ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 4);
if (ctx->ac.chip_class >= GFX11)
ret = si_insert_input_ret(ctx, ret, ctx->args.gs_attr_offset, 5);
ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
@@ -1330,6 +1332,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
if (ctx->ac.chip_class >= GFX11)
ret = si_insert_input_ptr(ctx, ret, ctx->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR);
if (ctx->stage == MESA_SHADER_VERTEX) {
ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);

View File

@@ -99,6 +99,9 @@ void si_blitter_end(struct si_context *sctx)
* non-global VS user SGPRs. */
sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
if (sctx->chip_class >= GFX11)
sctx->gs_attribute_ring_pointer_dirty = true;
/* Reset SI_SGPR_SMALL_PRIM_CULL_INFO: */
if (sctx->screen->use_ngg_culling)
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);

View File

@@ -2072,6 +2072,8 @@ void si_shader_pointers_mark_dirty(struct si_context *sctx)
sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
sctx->compute_shaderbuf_sgprs_dirty = true;
sctx->compute_image_sgprs_dirty = true;
if (sctx->chip_class >= GFX11)
sctx->gs_attribute_ring_pointer_dirty = true;
}
/* Set a base register address for user data constants in the given shader.
@@ -2227,6 +2229,13 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
sh_base[PIPE_SHADER_TESS_CTRL]);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
sh_base[PIPE_SHADER_GEOMETRY]);
if (sctx->gs_attribute_ring_pointer_dirty) {
assert(sctx->chip_class >= GFX11);
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
sctx->screen->attribute_ring->gpu_address);
sctx->gs_attribute_ring_pointer_dirty = false;
}
radeon_end();
sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_INTERNAL, SI_DESCS_FIRST_COMPUTE);
@@ -2791,6 +2800,9 @@ void si_init_all_descriptors(struct si_context *sctx)
si_get_user_data_base(sctx->chip_class, TESS_OFF, GS_OFF,
NGG_OFF, PIPE_SHADER_GEOMETRY));
si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
si_set_ring_buffer(sctx, SI_GS_ATTRIBUTE_RING, &sctx->screen->attribute_ring->b.b,
0, ~0u, false, true, 16, 32, 0);
}
static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask)

View File

@@ -385,6 +385,10 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
ctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (ctx->screen->attribute_ring) {
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
}
if (ctx->border_color_buffer) {
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->border_color_buffer,
RADEON_USAGE_READ | RADEON_PRIO_BORDER_COLORS);

View File

@@ -878,6 +878,8 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
sscreen->num_disk_shader_cache_misses);
}
si_resource_reference(&sscreen->attribute_ring, NULL);
simple_mtx_destroy(&sscreen->aux_context_lock);
if (sscreen->aux_context) {
@@ -1381,6 +1383,17 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->ngg_subgroup_size = 128;
if (sscreen->info.chip_class >= GFX11) {
/* TODO: tweak this */
unsigned attr_ring_size_per_se = align(1400000, 64 * 1024);
unsigned attr_ring_size = attr_ring_size_per_se * sscreen->info.max_se;
assert(attr_ring_size <= 16 * 1024 * 1024); /* maximum size */
sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_32BIT,
PIPE_USAGE_DEFAULT,
/* TODO: remove the overallocation */
attr_ring_size * 16, 2 * 1024 * 1024);
}
/* Create the auxiliary context. This must be done last. */
sscreen->aux_context = si_create_context(
&sscreen->b,

View File

@@ -707,6 +707,8 @@ struct si_screen {
struct util_idalloc_mt buffer_ids;
struct util_vertex_state_cache vertex_state_cache;
struct si_resource *attribute_ring;
};
struct si_sampler_view {
@@ -1209,6 +1211,7 @@ struct si_context {
bool bindless_descriptors_dirty;
bool graphics_bindless_pointer_dirty;
bool compute_bindless_pointer_dirty;
bool gs_attribute_ring_pointer_dirty;
/* Allocated bindless handles */
struct hash_table *tex_handles;

View File

@@ -542,7 +542,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
if (ctx->screen->info.chip_class >= GFX11)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_attr_offset);
else
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
@@ -573,6 +576,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
}
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info);
if (ctx->screen->info.chip_class >= GFX11)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_attr_address);
else
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
if (ctx->stage == MESA_SHADER_VERTEX)
declare_vb_descriptor_input_sgprs(ctx);

View File

@@ -215,6 +215,7 @@ enum
SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
GFX9_SGPR_ATTRIBUTE_RING_ADDR,
GFX9_GS_NUM_USER_SGPR,
/* PS only */

View File

@@ -73,6 +73,7 @@ struct si_shader_context {
struct ac_arg internal_bindings;
struct ac_arg bindless_samplers_and_images;
struct ac_arg small_prim_cull_info;
struct ac_arg gs_attr_address;
/* API VS */
struct ac_arg vb_descriptors[5];
struct ac_arg vertex_index0;

View File

@@ -110,14 +110,18 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
else
ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2);
ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
if (ctx->screen->info.chip_class >= GFX11)
ret = si_insert_input_ret(ctx, ret, ctx->args.gs_attr_offset, 5);
else
ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
if (ctx->screen->use_ngg) {
ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
ret = si_insert_input_ptr(ctx, ret, ctx->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO);
if (ctx->screen->info.chip_class >= GFX11)
ret = si_insert_input_ptr(ctx, ret, ctx->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR);
}
unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;

View File

@@ -690,6 +690,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
ac_build_export(&ctx->ac, &pos_args[i]);
}
if (!shader->info.nr_param_exports)
return;
/* Build parameter exports. Use 2 loops to export params in ascending order.
* 32 is the maximum number of parameter exports.
*/
@@ -707,8 +710,61 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
&param_exports[offset]);
}
for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
ac_build_export(&ctx->ac, &param_exports[i]);
if (ctx->screen->info.chip_class >= GFX11) {
/* Get the attribute ring address and descriptor. */
LLVMValueRef attr_address;
if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
LLVMValueRef ptr =
LLVMBuildPointerCast(ctx->ac.builder,
ac_get_arg(&ctx->ac, ctx->internal_bindings),
LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_CONST_32BIT), "");
attr_address = ac_build_load_to_sgpr(&ctx->ac, ptr,
LLVMConstInt(ctx->ac.i32, SI_GS_ATTRIBUTE_RING * 4, 0));
} else {
attr_address = ac_get_arg(&ctx->ac, ctx->gs_attr_address);
}
unsigned stride = 16 * shader->info.nr_param_exports;
LLVMValueRef attr_desc[4] = {
attr_address,
LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi) |
S_008F04_STRIDE(stride) |
S_008F04_SWIZZLE_ENABLE_GFX11(3) /* 16B */, 0),
LLVMConstInt(ctx->ac.i32, 0xffffffff, 0),
LLVMConstInt(ctx->ac.i32, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) |
S_008F0C_INDEX_STRIDE(2) /* 32 elements */, 0),
};
LLVMValueRef attr_rsrc = ac_build_gather_values(&ctx->ac, attr_desc, 4);
LLVMValueRef attr_offset = LLVMBuildShl(ctx->ac.builder,
si_unpack_param(ctx, ctx->args.gs_attr_offset, 0, 15),
LLVMConstInt(ctx->ac.i32, 9, 0), ""); /* 512B increments */
LLVMValueRef vindex = gfx10_get_thread_id_in_tg(ctx);
LLVMValueRef soffset[32];
/* Compute scalar offsets first. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
soffset[i] = LLVMBuildAdd(ctx->ac.builder, attr_offset,
LLVMConstInt(ctx->ac.i32, 32 * i * 16, 0), "");
}
/* Write attributes to the attribute ring buffer. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out,
4, 1, false);
ac_build_buffer_store_dword(&ctx->ac, attr_rsrc, vdata, vindex,
ctx->ac.i32_0, soffset[i], ac_swizzled);
}
} else {
/* Export attributes using parameter exports. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
ac_build_export(&ctx->ac, &param_exports[i]);
}
}
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)

View File

@@ -5791,5 +5791,27 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
}
if (sctx->chip_class >= GFX11) {
/* We must wait for idle before changing the SPI attribute ring registers. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi);
/* The PS will read inputs from this address. */
si_pm4_set_reg(pm4, R_031118_SPI_ATTRIBUTE_RING_BASE,
sscreen->attribute_ring->gpu_address >> 16);
si_pm4_set_reg(pm4, R_03111C_SPI_ATTRIBUTE_RING_SIZE,
S_03111C_MEM_SIZE(((sscreen->attribute_ring->bo_size /
sscreen->info.max_se) >> 16) - 1) |
S_03111C_L1_POLICY(1));
}
sctx->cs_preamble_state = pm4;
}

View File

@@ -357,8 +357,8 @@ enum
/* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
SI_PS_IMAGE_COLORBUF0,
SI_PS_IMAGE_COLORBUF0_HI,
SI_PS_IMAGE_COLORBUF0_FMASK,
SI_PS_IMAGE_COLORBUF0_FMASK_HI,
SI_PS_IMAGE_COLORBUF0_FMASK, /* gfx6-10 */
SI_PS_IMAGE_COLORBUF0_FMASK_HI, /* gfx6-10 */
/* Internal constant buffers. */
SI_HS_CONST_DEFAULT_TESS_LEVELS,
@@ -368,12 +368,17 @@ enum
SI_PS_CONST_SAMPLE_POSITIONS,
SI_RING_ESGS, /* gfx6-8 */
SI_RING_GSVS,
SI_RING_GSVS, /* gfx6-10 */
SI_NUM_INTERNAL_BINDINGS,
/* Aliases to reuse slots that are unused on other generations. */
SI_GS_QUERY_BUF = SI_RING_ESGS, /* gfx10+ */
/* Only u_blitter uses this (and compute should be used in most cases, so this shouldn't
* be used much). Normal draws get the address from a user SGPR.
*/
SI_GS_ATTRIBUTE_RING = SI_RING_GSVS, /* gfx11+ */
};
/* Indices into sctx->descriptors, laid out so that gfx and compute pipelines