radv: precompute legacy GS register values
To make emission faster. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29022>
This commit is contained in:

committed by
Marge Bot

parent
fa9b0ee86c
commit
4b53d36f0d
@@ -3180,39 +3180,21 @@ radv_emit_hw_gs(const struct radv_device *device, struct radeon_cmdbuf *ctx_cs,
|
||||
{
|
||||
const struct radv_physical_device *pdev = radv_device_physical(device);
|
||||
const struct radv_legacy_gs_info *gs_state = &gs->info.gs_ring_info;
|
||||
unsigned gs_max_out_vertices;
|
||||
const uint8_t *num_components;
|
||||
uint8_t max_stream;
|
||||
unsigned offset;
|
||||
uint64_t va;
|
||||
|
||||
gs_max_out_vertices = gs->info.gs.vertices_out;
|
||||
max_stream = gs->info.gs.max_stream;
|
||||
num_components = gs->info.gs.num_stream_output_components;
|
||||
|
||||
offset = num_components[0] * gs_max_out_vertices;
|
||||
|
||||
radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
|
||||
radeon_emit(ctx_cs, offset);
|
||||
if (max_stream >= 1)
|
||||
offset += num_components[1] * gs_max_out_vertices;
|
||||
radeon_emit(ctx_cs, offset);
|
||||
if (max_stream >= 2)
|
||||
offset += num_components[2] * gs_max_out_vertices;
|
||||
radeon_emit(ctx_cs, offset);
|
||||
if (max_stream >= 3)
|
||||
offset += num_components[3] * gs_max_out_vertices;
|
||||
radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gsvs_ring_offset[0]);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gsvs_ring_offset[1]);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gsvs_ring_offset[2]);
|
||||
radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gs->info.regs.gs.vgt_gsvs_ring_itemsize);
|
||||
|
||||
radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
|
||||
radeon_emit(ctx_cs, num_components[0]);
|
||||
radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
|
||||
radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
|
||||
radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gs_vert_itemsize[0]);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gs_vert_itemsize[1]);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gs_vert_itemsize[2]);
|
||||
radeon_emit(ctx_cs, gs->info.regs.gs.vgt_gs_vert_itemsize[3]);
|
||||
|
||||
uint32_t gs_num_invocations = gs->info.gs.invocations;
|
||||
radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
|
||||
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
|
||||
radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT, gs->info.regs.gs.vgt_gs_instance_cnt);
|
||||
|
||||
if (pdev->info.gfx_level <= GFX8) {
|
||||
/* GFX6-8: ESGS offchip ring buffer is allocated according to VGT_ESGS_RING_ITEMSIZE.
|
||||
@@ -3248,14 +3230,10 @@ radv_emit_hw_gs(const struct radv_device *device, struct radeon_cmdbuf *ctx_cs,
|
||||
radeon_emit(cs, gs->config.rsrc2);
|
||||
}
|
||||
|
||||
radeon_set_sh_reg_idx(
|
||||
pdev, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
|
||||
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F), C_00B21C_CU_EN, 0, &pdev->info));
|
||||
radeon_set_sh_reg_idx(pdev, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, gs->info.regs.gs.spi_shader_pgm_rsrc3_gs);
|
||||
|
||||
if (pdev->info.gfx_level >= GFX10) {
|
||||
radeon_set_sh_reg_idx(pdev, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
|
||||
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
|
||||
C_00B204_CU_EN_GFX10, 16, &pdev->info));
|
||||
radeon_set_sh_reg_idx(pdev, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, gs->info.regs.gs.spi_shader_pgm_rsrc4_gs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3271,7 +3249,7 @@ radv_emit_geometry_shader(const struct radv_device *device, struct radeon_cmdbuf
|
||||
radv_emit_hw_vs(device, ctx_cs, cs, gs_copy_shader);
|
||||
}
|
||||
|
||||
radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
|
||||
radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.regs.vgt_gs_max_vert_out);
|
||||
|
||||
if (gs->info.merged_shader_compiled_separately) {
|
||||
const struct radv_userdata_info *vgt_esgs_ring_itemsize = radv_get_user_sgpr(gs, AC_UD_VGT_ESGS_RING_ITEMSIZE);
|
||||
@@ -3301,7 +3279,7 @@ radv_emit_mesh_shader(const struct radv_device *device, struct radeon_cmdbuf *ct
|
||||
const uint32_t gs_out = radv_conv_gl_prim_to_gs_out(ms->info.ms.output_prim);
|
||||
|
||||
radv_emit_hw_ngg(device, ctx_cs, cs, NULL, ms);
|
||||
radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, ms->info.regs.ms.vgt_gs_max_vert_out);
|
||||
radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, ms->info.regs.vgt_gs_max_vert_out);
|
||||
radeon_set_uconfig_reg_idx(pdev, ctx_cs, R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
|
||||
|
||||
if (pdev->mesh_fast_launch_2) {
|
||||
|
@@ -1464,6 +1464,7 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar
|
||||
static void
|
||||
radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_binary *binary)
|
||||
{
|
||||
const struct radv_physical_device *pdev = radv_device_physical(device);
|
||||
struct radv_shader_info *info = &binary->info;
|
||||
|
||||
info->regs.gs.vgt_esgs_ring_itemsize = info->gs_ring_info.esgs_itemsize;
|
||||
@@ -1474,6 +1475,41 @@ radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_b
|
||||
info->regs.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(info->gs_ring_info.es_verts_per_subgroup) |
|
||||
S_028A44_GS_PRIMS_PER_SUBGRP(info->gs_ring_info.gs_prims_per_subgroup) |
|
||||
S_028A44_GS_INST_PRIMS_IN_SUBGRP(info->gs_ring_info.gs_inst_prims_in_subgroup);
|
||||
|
||||
const uint32_t gs_max_out_vertices = info->gs.vertices_out;
|
||||
const uint8_t max_stream = info->gs.max_stream;
|
||||
const uint8_t *num_components = info->gs.num_stream_output_components;
|
||||
|
||||
uint32_t offset = num_components[0] * gs_max_out_vertices;
|
||||
info->regs.gs.vgt_gsvs_ring_offset[0] = offset;
|
||||
|
||||
if (max_stream >= 1)
|
||||
offset += num_components[1] * gs_max_out_vertices;
|
||||
info->regs.gs.vgt_gsvs_ring_offset[1] = offset;
|
||||
|
||||
if (max_stream >= 2)
|
||||
offset += num_components[2] * gs_max_out_vertices;
|
||||
info->regs.gs.vgt_gsvs_ring_offset[2] = offset;
|
||||
|
||||
if (max_stream >= 3)
|
||||
offset += num_components[3] * gs_max_out_vertices;
|
||||
info->regs.gs.vgt_gsvs_ring_itemsize = offset;
|
||||
|
||||
for (uint32_t i = 0; i < 4; i++)
|
||||
info->regs.gs.vgt_gs_vert_itemsize[i] = (max_stream >= i) ? num_components[i] : 0;
|
||||
|
||||
const uint32_t gs_num_invocations = info->gs.invocations;
|
||||
info->regs.gs.vgt_gs_instance_cnt =
|
||||
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
|
||||
|
||||
info->regs.gs.spi_shader_pgm_rsrc3_gs =
|
||||
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F), C_00B21C_CU_EN, 0, &pdev->info);
|
||||
|
||||
if (pdev->info.gfx_level >= GFX10) {
|
||||
info->regs.gs.spi_shader_pgm_rsrc4_gs =
|
||||
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), C_00B204_CU_EN_GFX10,
|
||||
16, &pdev->info);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1482,7 +1518,7 @@ radv_precompute_registers_hw_ms(struct radv_device *device, struct radv_shader_b
|
||||
const struct radv_physical_device *pdev = radv_device_physical(device);
|
||||
struct radv_shader_info *info = &binary->info;
|
||||
|
||||
info->regs.ms.vgt_gs_max_vert_out = pdev->mesh_fast_launch_2 ? info->ngg_info.max_out_verts : info->workgroup_size;
|
||||
info->regs.vgt_gs_max_vert_out = pdev->mesh_fast_launch_2 ? info->ngg_info.max_out_verts : info->workgroup_size;
|
||||
|
||||
info->regs.ms.spi_shader_gs_meshlet_dim = S_00B2B0_MESHLET_NUM_THREAD_X(info->cs.block_size[0] - 1) |
|
||||
S_00B2B0_MESHLET_NUM_THREAD_Y(info->cs.block_size[1] - 1) |
|
||||
@@ -1527,12 +1563,14 @@ radv_precompute_registers_hw_cs(struct radv_device *device, struct radv_shader_b
|
||||
static void
|
||||
radv_precompute_registers(struct radv_device *device, struct radv_shader_binary *binary)
|
||||
{
|
||||
const struct radv_shader_info *info = &binary->info;
|
||||
struct radv_shader_info *info = &binary->info;
|
||||
|
||||
switch (info->stage) {
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
if (!info->is_ngg)
|
||||
radv_precompute_registers_hw_gs(device, binary);
|
||||
|
||||
info->regs.vgt_gs_max_vert_out = info->gs.vertices_out;
|
||||
break;
|
||||
case MESA_SHADER_MESH:
|
||||
radv_precompute_registers_hw_ms(device, binary);
|
||||
|
@@ -254,15 +254,20 @@ struct radv_shader_info {
|
||||
/* Precomputed register values. */
|
||||
struct {
|
||||
struct {
|
||||
uint32_t spi_shader_pgm_rsrc3_gs;
|
||||
uint32_t spi_shader_pgm_rsrc4_gs;
|
||||
uint32_t vgt_esgs_ring_itemsize;
|
||||
uint32_t vgt_gs_instance_cnt;
|
||||
uint32_t vgt_gs_max_prims_per_subgroup;
|
||||
uint32_t vgt_gs_onchip_cntl;
|
||||
uint32_t vgt_gs_vert_itemsize[4];
|
||||
uint32_t vgt_gsvs_ring_itemsize;
|
||||
uint32_t vgt_gsvs_ring_offset[3];
|
||||
} gs;
|
||||
|
||||
struct {
|
||||
uint32_t spi_shader_gs_meshlet_dim;
|
||||
uint32_t spi_shader_gs_meshlet_exp_alloc;
|
||||
uint32_t vgt_gs_max_vert_out;
|
||||
} ms;
|
||||
|
||||
struct {
|
||||
@@ -277,6 +282,9 @@ struct radv_shader_info {
|
||||
uint32_t compute_num_thread_z;
|
||||
uint32_t compute_resource_limits;
|
||||
} cs;
|
||||
|
||||
/* Common registers between stages. */
|
||||
uint32_t vgt_gs_max_vert_out;
|
||||
} regs;
|
||||
};
|
||||
|
||||
|
Reference in New Issue
Block a user