diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index eb3562eb84e..e1f1022c52d 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1756,3 +1756,85 @@ unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves } return compute_resource_limits; } + +/* Fills every field of *hs with the tessellation (HS) off-chip parameters derived from info. Reads info->chip_class, info->family and info->max_se; info itself is never written. NOTE(review): info could probably be const-qualified here and in the header prototype - confirm against callers. */ void ac_get_hs_info(struct radeon_info *info, + struct ac_hs_info *hs) +{ + bool double_offchip_buffers = info->chip_class >= GFX7 && + info->family != CHIP_CARRIZO && + info->family != CHIP_STONEY; + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + unsigned max_offchip_buffers; + unsigned offchip_granularity; + unsigned hs_offchip_param; + + hs->tess_offchip_block_dw_size = + info->family == CHIP_HAWAII ? 4096 : 8192; + + /* + * Per RadeonSI: + * This must be one less than the maximum number due to a hw limitation. + * Various hardware bugs need this. + * + * Per AMDVLK: + * Vega10 should limit max_offchip_buffers to 508 (4 * 127). + * Gfx7 should limit max_offchip_buffers to 508 + * Gfx6 should limit max_offchip_buffers to 126 (2 * 63) + * + * Follow AMDVLK here. + */ + if (info->chip_class >= GFX10) { + max_offchip_buffers_per_se = 128; + } else if (info->family == CHIP_VEGA10 || + info->chip_class == GFX7 || + info->chip_class == GFX6) + --max_offchip_buffers_per_se; + + max_offchip_buffers = max_offchip_buffers_per_se * info->max_se; /* total across all shader engines, before the per-generation clamp below */ + + /* Hawaii has a bug with offchip buffers > 256 that can be worked + * around by setting 4K granularity. 
+ */ + if (hs->tess_offchip_block_dw_size == 4096) { + assert(info->family == CHIP_HAWAII); + offchip_granularity = V_03093C_X_4K_DWORDS; + } else { + assert(hs->tess_offchip_block_dw_size == 8192); + offchip_granularity = V_03093C_X_8K_DWORDS; + } + + switch (info->chip_class) { + case GFX6: + max_offchip_buffers = MIN2(max_offchip_buffers, 126); + break; + case GFX7: + case GFX8: + case GFX9: + max_offchip_buffers = MIN2(max_offchip_buffers, 508); + break; + case GFX10: + break; + default: + break; + } + + hs->max_offchip_buffers = max_offchip_buffers; /* stored before the GFX8+ decrement below; tess_offchip_ring_size is computed from this stored value */ + + if (info->chip_class >= GFX10_3) { + hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) | + S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity); + } else if (info->chip_class >= GFX7) { + if (info->chip_class >= GFX8) + --max_offchip_buffers; /* local only: the value packed into OFFCHIP_BUFFERING is count - 1 on GFX8+, hs->max_offchip_buffers is unaffected */ + hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY_GFX7(offchip_granularity); + } else { + hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); /* offchip_granularity is not packed on this path */ + } + + hs->hs_offchip_param = hs_offchip_param; + + hs->tess_factor_ring_size = 32768 * info->max_se; + hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024); + hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4; /* block size is in dwords; the * 4 presumably converts to bytes - confirm */ +} diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index b85821c87f3..814e259bed1 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -261,6 +261,18 @@ void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config, unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup, unsigned max_waves_per_sh, unsigned threadgroups_per_cu); +/* Tessellation (HS) off-chip buffer and ring parameters; all fields are written by ac_get_hs_info(). */ struct ac_hs_info { + uint32_t tess_offchip_block_dw_size; /* 4096 for CHIP_HAWAII, 8192 otherwise */ + uint32_t max_offchip_buffers; /* buffer count after the per-generation clamp */ + uint32_t hs_offchip_param; /* value RADV writes to the VGT_HS_OFFCHIP_PARAM register */ + uint32_t tess_factor_ring_size; /* 32768 * info->max_se */ + uint32_t tess_offchip_ring_offset; /* align(tess_factor_ring_size, 64 * 1024); RADV adds it to the tess rings BO address */ + uint32_t tess_offchip_ring_size; /* max_offchip_buffers * tess_offchip_block_dw_size * 4 */ 
+}; + +/* Fills every field of *hs from info; info is only read. */ void ac_get_hs_info(struct radeon_info *info, + struct ac_hs_info *hs); + #ifdef __cplusplus } #endif diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index b850485b45c..9822deb815d 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -2735,88 +2735,6 @@ radv_device_init_gs_info(struct radv_device *device) device->physical_device->rad_info.family); } -static void -radv_device_init_hs_info(struct radv_device *device) -{ - bool double_offchip_buffers = device->physical_device->rad_info.chip_class >= GFX7 && - device->physical_device->rad_info.family != CHIP_CARRIZO && - device->physical_device->rad_info.family != CHIP_STONEY; - unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; - unsigned max_offchip_buffers; - unsigned offchip_granularity; - unsigned hs_offchip_param; - - device->tess_offchip_block_dw_size = - device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192; - - /* - * Per RadeonSI: - * This must be one less than the maximum number due to a hw limitation. - * Various hardware bugs need this. - * - * Per AMDVLK: - * Vega10 should limit max_offchip_buffers to 508 (4 * 127). - * Gfx7 should limit max_offchip_buffers to 508 - * Gfx6 should limit max_offchip_buffers to 126 (2 * 63) - * - * Follow AMDVLK here. - */ - if (device->physical_device->rad_info.chip_class >= GFX10) { - max_offchip_buffers_per_se = 128; - } else if (device->physical_device->rad_info.family == CHIP_VEGA10 || - device->physical_device->rad_info.chip_class == GFX7 || - device->physical_device->rad_info.chip_class == GFX6) - --max_offchip_buffers_per_se; - - max_offchip_buffers = max_offchip_buffers_per_se * device->physical_device->rad_info.max_se; - - /* Hawaii has a bug with offchip buffers > 256 that can be worked - * around by setting 4K granularity. 
- */ - if (device->tess_offchip_block_dw_size == 4096) { - assert(device->physical_device->rad_info.family == CHIP_HAWAII); - offchip_granularity = V_03093C_X_4K_DWORDS; - } else { - assert(device->tess_offchip_block_dw_size == 8192); - offchip_granularity = V_03093C_X_8K_DWORDS; - } - - switch (device->physical_device->rad_info.chip_class) { - case GFX6: - max_offchip_buffers = MIN2(max_offchip_buffers, 126); - break; - case GFX7: - case GFX8: - case GFX9: - max_offchip_buffers = MIN2(max_offchip_buffers, 508); - break; - case GFX10: - break; - default: - break; - } - - device->max_offchip_buffers = max_offchip_buffers; - - if (device->physical_device->rad_info.chip_class >= GFX10_3) { - hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) | - S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity); - } else if (device->physical_device->rad_info.chip_class >= GFX7) { - if (device->physical_device->rad_info.chip_class >= GFX8) - --max_offchip_buffers; - hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(max_offchip_buffers) | - S_03093C_OFFCHIP_GRANULARITY_GFX7(offchip_granularity); - } else { - hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); - } - - device->hs_offchip_param = hs_offchip_param; - - device->tess_factor_ring_size = 32768 * device->physical_device->rad_info.max_se; - device->tess_offchip_ring_offset = align(device->tess_factor_ring_size, 64 * 1024); - device->tess_offchip_ring_size = device->max_offchip_buffers * device->tess_offchip_block_dw_size * 4; -} - static VkResult radv_device_init_border_color(struct radv_device *device) { @@ -3410,7 +3328,8 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr radv_device_init_gs_info(device); - radv_device_init_hs_info(device); + ac_get_hs_info(&device->physical_device->rad_info, + &device->hs); if (device->instance->debug_flags & RADV_DEBUG_HANG) { /* Enable GPU hangs detection and dump logs if a GPU hang is @@ -3755,11 +3674,11 @@ 
radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_ if (tess_rings_bo) { uint64_t tess_va = radv_buffer_get_va(tess_rings_bo); - uint64_t tess_offchip_va = tess_va + queue->device->tess_offchip_ring_offset; + uint64_t tess_offchip_va = tess_va + queue->device->hs.tess_offchip_ring_offset; desc[0] = tess_va; desc[1] = S_008F04_BASE_ADDRESS_HI(tess_va >> 32); - desc[2] = queue->device->tess_factor_ring_size; + desc[2] = queue->device->hs.tess_factor_ring_size; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); @@ -3773,7 +3692,7 @@ radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_ desc[4] = tess_offchip_va; desc[5] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32); - desc[6] = queue->device->tess_offchip_ring_size; + desc[6] = queue->device->hs.tess_offchip_ring_size; desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); @@ -3834,7 +3753,7 @@ radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs, if (!tess_rings_bo) return; - tf_ring_size = queue->device->tess_factor_ring_size / 4; + tf_ring_size = queue->device->hs.tess_factor_ring_size / 4; tf_va = radv_buffer_get_va(tess_rings_bo); radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo); @@ -3849,11 +3768,11 @@ radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs, } else if (queue->device->physical_device->rad_info.chip_class == GFX9) { radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(tf_va >> 40)); } - radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, queue->device->hs_offchip_param); + radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, queue->device->hs.hs_offchip_param); } else { radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE, 
S_008988_SIZE(tf_ring_size)); radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE, tf_va >> 8); - radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM, queue->device->hs_offchip_param); + radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM, queue->device->hs.hs_offchip_param); } } @@ -4051,7 +3970,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave if (add_tess_rings) { result = queue->device->ws->buffer_create( - queue->device->ws, queue->device->tess_offchip_ring_offset + queue->device->tess_offchip_ring_size, 256, + queue->device->ws, queue->device->hs.tess_offchip_ring_offset + queue->device->hs.tess_offchip_ring_size, 256, RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &tess_rings_bo); if (result != VK_SUCCESS) goto fail; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index b1636671fa4..63b5400836e 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3553,7 +3553,7 @@ gather_tess_info(struct radv_device *device, struct radv_pipeline_stage *stages, tess_in_patch_size, tess_out_patch_size, stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs, stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs, - stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size, + stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs, device->hs.tess_offchip_block_dw_size, device->physical_device->rad_info.chip_class, device->physical_device->rad_info.family); /* LDS size used by VS+TCS for storing TCS inputs and outputs. 
 */ diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 4cbfc84df88..2e5917f9456 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -774,16 +774,11 @@ struct radv_device { int queue_count[RADV_MAX_QUEUE_FAMILIES]; bool pbb_allowed; - uint32_t tess_offchip_block_dw_size; uint32_t scratch_waves; uint32_t dispatch_initiator; uint32_t gs_table_depth; - uint32_t hs_offchip_param; - uint32_t max_offchip_buffers; - uint32_t tess_offchip_ring_size; - uint32_t tess_offchip_ring_offset; - uint32_t tess_factor_ring_size; + struct ac_hs_info hs; /* MSAA sample locations. * The first index is the sample index.