diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index ebf623908cf..4299ab72356 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2647,17 +2647,16 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer) */ if (cmd_buffer->state.uses_dynamic_patch_control_points) { /* Compute the number of patches. */ - cmd_buffer->state.tess_num_patches = get_tcs_num_patches( - d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs, + cmd_buffer->state.tess_num_patches = radv_get_tcs_num_patches( + pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs, tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs, - tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, - pdev->info.gfx_level, pdev->info.family); + tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs); /* Compute the LDS size. 
*/ cmd_buffer->state.tess_lds_size = - calculate_tess_lds_size(pdev->info.gfx_level, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, - vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches, - tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs); + radv_get_tess_lds_size(pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, + vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches, + tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs); } ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) | diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 8d8a0290141..c840953f56d 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2955,6 +2955,45 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx) return &shader->info.user_sgprs_locs.shader_data[idx]; } +static uint32_t +radv_get_tess_patch_size(uint32_t tcs_num_input_vertices, uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, + uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs) +{ /* Returns the input patch size plus the output patch size; each output slot is counted as 16. */ + const uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); + const uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; + const uint32_t lds_output_vertex_size = tcs_num_lds_outputs * 16; + const uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size; + const uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16; + + return input_patch_size + lds_output_patch_size; +} + +uint32_t +radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices, + unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_lds_outputs, + unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs, + unsigned tcs_num_vram_patch_outputs) +{ /* The VRAM size passes 0 inputs, presumably so only outputs count (TODO confirm stride(0) == 0). */ + const uint32_t lds_per_patch = 
radv_get_tess_patch_size( + tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs); + const uint32_t vram_per_patch = radv_get_tess_patch_size(tcs_num_input_vertices, tcs_num_output_vertices, 0, + tcs_num_vram_outputs, tcs_num_vram_patch_outputs); + + return ac_compute_num_tess_patches(&pdev->info, tcs_num_input_vertices, tcs_num_output_vertices, vram_per_patch, + lds_per_patch, pdev->ge_wave_size, false); +} + +uint32_t +radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices, + uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches, + uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs) +{ /* Forwards the per-patch size and the patch count to ac_compute_tess_lds_size(). */ + const uint32_t lds_per_patch = radv_get_tess_patch_size( + tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs); + + return ac_compute_tess_lds_size(&pdev->info, lds_per_patch, tcs_num_patches); +} + VkResult radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline, struct radv_shader *shader, gl_shader_stage stage, FILE *output) diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 4adad619389..ddda3af0f6f 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -642,85 +642,14 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs) return stride; } -static inline unsigned -calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, - unsigned tcs_num_inputs, unsigned tcs_num_patches, unsigned tcs_num_outputs, - unsigned tcs_num_patch_outputs) -{ - unsigned input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); - unsigned output_vertex_size = tcs_num_outputs * 16; +uint32_t radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices, + unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, + 
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, + unsigned tcs_num_vram_outputs, unsigned tcs_num_vram_patch_outputs); - unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size; - - unsigned pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; - unsigned output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; - - unsigned output_patch0_offset = input_patch_size * tcs_num_patches; - - unsigned lds_size = output_patch0_offset + output_patch_size * tcs_num_patches; - - if (gfx_level >= GFX7) { - assert(lds_size <= 65536); - lds_size = align(lds_size, 512) / 512; - } else { - assert(lds_size <= 32768); - lds_size = align(lds_size, 256) / 256; - } - - return lds_size; -} - -static inline unsigned -get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, - unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs, - unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size, - enum amd_gfx_level gfx_level, enum radeon_family family) -{ - uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); - uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; - uint32_t lds_output_vertex_size = tcs_num_lds_outputs * 16; - uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size; - uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16; - - uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16; - uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size; - uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. 
- */ - unsigned num_patches = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices) * 4; - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - */ - unsigned hardware_lds_size = 32768; - - /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single - * threadgroup, even though there is more than 32 KiB LDS. - * - * Test: dEQP-VK.tessellation.shader_input_output.barrier - */ - if (gfx_level >= GFX7 && family != CHIP_STONEY) - hardware_lds_size = 65536; - - if (input_patch_size + lds_output_patch_size) - num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size)); - /* Make sure the output data fits in the offchip buffer */ - if (vram_output_patch_size) - num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size); - /* Not necessary for correctness, but improves performance. The - * specific value is taken from the proprietary driver. - */ - num_patches = MIN2(num_patches, 40); - - /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. 
*/ - if (gfx_level == GFX6) { - unsigned one_wave = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices); - num_patches = MIN2(num_patches, one_wave); - } - return num_patches; -} +uint32_t radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices, + uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches, + uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs); void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, const struct radv_graphics_state_key *gfx_state); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 21739d37b34..6282ba838b7 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -564,17 +564,15 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir, if (gfx_state->ts.patch_control_points) { /* Number of tessellation patches per workgroup processed by the current pipeline. */ - info->num_tess_patches = get_tcs_num_patches( - gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, + info->num_tess_patches = radv_get_tcs_num_patches( + pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs, - info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, - pdev->info.family); + info->tcs.num_linked_patch_outputs); /* LDS size used by VS+TCS for storing TCS inputs and outputs. 
*/ - info->tcs.num_lds_blocks = - calculate_tess_lds_size(pdev->info.gfx_level, gfx_state->ts.patch_control_points, - nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->num_tess_patches, - info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs); + info->tcs.num_lds_blocks = radv_get_tess_lds_size( + pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, + info->num_tess_patches, info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs); } }