radv: Track scratch usage across pipelines & command buffers.
Based on code written by Dave Airlie. Signed-off-by: Bas Nieuwenhuizen <basni@google.com> Reviewed-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
@@ -627,6 +627,13 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer,
|
|||||||
radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
|
radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
|
||||||
pipeline->graphics.prim_restart_enable);
|
pipeline->graphics.prim_restart_enable);
|
||||||
|
|
||||||
|
cmd_buffer->scratch_size_needed =
|
||||||
|
MAX2(cmd_buffer->scratch_size_needed,
|
||||||
|
pipeline->max_waves * pipeline->scratch_bytes_per_wave);
|
||||||
|
|
||||||
|
radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
|
||||||
|
S_0286E8_WAVES(pipeline->max_waves) |
|
||||||
|
S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
|
||||||
cmd_buffer->state.emitted_pipeline = pipeline;
|
cmd_buffer->state.emitted_pipeline = pipeline;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1402,6 +1409,8 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
|
|||||||
free(up);
|
free(up);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cmd_buffer->scratch_size_needed = 0;
|
||||||
|
cmd_buffer->compute_scratch_size_needed = 0;
|
||||||
if (cmd_buffer->upload.upload_bo)
|
if (cmd_buffer->upload.upload_bo)
|
||||||
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
|
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
|
||||||
cmd_buffer->upload.upload_bo, 8);
|
cmd_buffer->upload.upload_bo, 8);
|
||||||
@@ -1629,9 +1638,15 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
|
|||||||
radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
|
radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
|
||||||
radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
|
radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
|
||||||
|
|
||||||
|
|
||||||
|
cmd_buffer->compute_scratch_size_needed =
|
||||||
|
MAX2(cmd_buffer->compute_scratch_size_needed,
|
||||||
|
pipeline->max_waves * pipeline->scratch_bytes_per_wave);
|
||||||
|
|
||||||
/* change these once we have scratch support */
|
/* change these once we have scratch support */
|
||||||
radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
|
radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
|
||||||
S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
|
S_00B860_WAVES(pipeline->max_waves) |
|
||||||
|
S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
|
||||||
|
|
||||||
radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
|
radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
|
||||||
radeon_emit(cmd_buffer->cs,
|
radeon_emit(cmd_buffer->cs,
|
||||||
@@ -1821,6 +1836,11 @@ void radv_CmdExecuteCommands(
|
|||||||
for (uint32_t i = 0; i < commandBufferCount; i++) {
|
for (uint32_t i = 0; i < commandBufferCount; i++) {
|
||||||
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
|
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
|
||||||
|
|
||||||
|
primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
|
||||||
|
secondary->scratch_size_needed);
|
||||||
|
primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
|
||||||
|
secondary->compute_scratch_size_needed);
|
||||||
|
|
||||||
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
|
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -813,6 +813,28 @@ VkResult radv_CreateDevice(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if HAVE_LLVM < 0x0400
|
||||||
|
device->llvm_supports_spill = false;
|
||||||
|
#else
|
||||||
|
device->llvm_supports_spill = true;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The maximum number of scratch waves. Scratch space isn't divided
|
||||||
|
* evenly between CUs. The number is only a function of the number of CUs.
|
||||||
|
* We can decrease the constant to decrease the scratch buffer size.
|
||||||
|
*
|
||||||
|
* device->scratch_waves must be >= the maximum possible size of
|
||||||
|
* 1 threadgroup, so that the hw doesn't hang from being unable
|
||||||
|
* to start any.
|
||||||
|
*
|
||||||
|
* The recommended value is 4 per CU at most. Higher numbers don't
|
||||||
|
* bring much benefit, but they still occupy chip resources (think
|
||||||
|
* async compute). I've seen ~2% performance difference between 4 and 32.
|
||||||
|
*/
|
||||||
|
uint32_t max_threads_per_block = 2048;
|
||||||
|
device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
|
||||||
|
max_threads_per_block / 64);
|
||||||
|
|
||||||
result = radv_device_init_meta(device);
|
result = radv_device_init_meta(device);
|
||||||
if (result != VK_SUCCESS)
|
if (result != VK_SUCCESS)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@@ -104,6 +104,19 @@ void radv_DestroyShaderModule(
|
|||||||
vk_free2(&device->alloc, pAllocator, module);
|
vk_free2(&device->alloc, pAllocator, module);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
radv_pipeline_destroy(struct radv_device *device,
|
||||||
|
struct radv_pipeline *pipeline,
|
||||||
|
const VkAllocationCallbacks* allocator)
|
||||||
|
{
|
||||||
|
for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
|
||||||
|
if (pipeline->shaders[i])
|
||||||
|
radv_shader_variant_destroy(device, pipeline->shaders[i]);
|
||||||
|
|
||||||
|
vk_free2(&device->alloc, allocator, pipeline);
|
||||||
|
}
|
||||||
|
|
||||||
void radv_DestroyPipeline(
|
void radv_DestroyPipeline(
|
||||||
VkDevice _device,
|
VkDevice _device,
|
||||||
VkPipeline _pipeline,
|
VkPipeline _pipeline,
|
||||||
@@ -115,11 +128,7 @@ void radv_DestroyPipeline(
|
|||||||
if (!_pipeline)
|
if (!_pipeline)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
|
radv_pipeline_destroy(device, pipeline, pAllocator);
|
||||||
if (pipeline->shaders[i])
|
|
||||||
radv_shader_variant_destroy(device, pipeline->shaders[i]);
|
|
||||||
|
|
||||||
vk_free2(&device->alloc, pAllocator, pipeline);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -499,6 +508,48 @@ radv_pipeline_compile(struct radv_pipeline *pipeline,
|
|||||||
return variant;
|
return variant;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static VkResult
|
||||||
|
radv_pipeline_scratch_init(struct radv_device *device,
|
||||||
|
struct radv_pipeline *pipeline)
|
||||||
|
{
|
||||||
|
unsigned scratch_bytes_per_wave = 0;
|
||||||
|
unsigned max_waves = 0;
|
||||||
|
unsigned min_waves = 1;
|
||||||
|
|
||||||
|
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
|
||||||
|
if (pipeline->shaders[i]) {
|
||||||
|
unsigned max_stage_waves = device->scratch_waves;
|
||||||
|
|
||||||
|
scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
|
||||||
|
pipeline->shaders[i]->config.scratch_bytes_per_wave);
|
||||||
|
|
||||||
|
max_stage_waves = MIN2(max_stage_waves,
|
||||||
|
4 * device->physical_device->rad_info.num_good_compute_units *
|
||||||
|
(256 / pipeline->shaders[i]->config.num_vgprs));
|
||||||
|
max_waves = MAX2(max_waves, max_stage_waves);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
|
||||||
|
unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
|
||||||
|
pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
|
||||||
|
pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
|
||||||
|
min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scratch_bytes_per_wave)
|
||||||
|
max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave);
|
||||||
|
|
||||||
|
if (scratch_bytes_per_wave && max_waves < min_waves) {
|
||||||
|
/* Not really true at this moment, but will be true on first
|
||||||
|
* execution. Avoid having hanging shaders. */
|
||||||
|
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||||
|
}
|
||||||
|
pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
|
||||||
|
pipeline->max_waves = max_waves;
|
||||||
|
return VK_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
static uint32_t si_translate_blend_function(VkBlendOp op)
|
static uint32_t si_translate_blend_function(VkBlendOp op)
|
||||||
{
|
{
|
||||||
switch (op) {
|
switch (op) {
|
||||||
@@ -1313,6 +1364,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
|
|||||||
const VkAllocationCallbacks *alloc)
|
const VkAllocationCallbacks *alloc)
|
||||||
{
|
{
|
||||||
struct radv_shader_module fs_m = {0};
|
struct radv_shader_module fs_m = {0};
|
||||||
|
VkResult result;
|
||||||
|
|
||||||
if (alloc == NULL)
|
if (alloc == NULL)
|
||||||
alloc = &device->alloc;
|
alloc = &device->alloc;
|
||||||
@@ -1421,7 +1473,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
|
|||||||
radv_dump_pipeline_stats(device, pipeline);
|
radv_dump_pipeline_stats(device, pipeline);
|
||||||
}
|
}
|
||||||
|
|
||||||
return VK_SUCCESS;
|
result = radv_pipeline_scratch_init(device, pipeline);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
VkResult
|
VkResult
|
||||||
@@ -1447,7 +1500,7 @@ radv_graphics_pipeline_create(
|
|||||||
result = radv_pipeline_init(pipeline, device, cache,
|
result = radv_pipeline_init(pipeline, device, cache,
|
||||||
pCreateInfo, extra, pAllocator);
|
pCreateInfo, extra, pAllocator);
|
||||||
if (result != VK_SUCCESS) {
|
if (result != VK_SUCCESS) {
|
||||||
vk_free2(&device->alloc, pAllocator, pipeline);
|
radv_pipeline_destroy(device, pipeline, pAllocator);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1493,6 +1546,7 @@ static VkResult radv_compute_pipeline_create(
|
|||||||
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
|
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
|
||||||
RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module);
|
RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module);
|
||||||
struct radv_pipeline *pipeline;
|
struct radv_pipeline *pipeline;
|
||||||
|
VkResult result;
|
||||||
|
|
||||||
pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
|
pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
|
||||||
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
||||||
@@ -1510,6 +1564,13 @@ static VkResult radv_compute_pipeline_create(
|
|||||||
pCreateInfo->stage.pSpecializationInfo,
|
pCreateInfo->stage.pSpecializationInfo,
|
||||||
pipeline->layout, NULL);
|
pipeline->layout, NULL);
|
||||||
|
|
||||||
|
|
||||||
|
result = radv_pipeline_scratch_init(device, pipeline);
|
||||||
|
if (result != VK_SUCCESS) {
|
||||||
|
radv_pipeline_destroy(device, pipeline, pAllocator);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
*pPipeline = radv_pipeline_to_handle(pipeline);
|
*pPipeline = radv_pipeline_to_handle(pipeline);
|
||||||
|
|
||||||
if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
|
if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
|
||||||
|
@@ -485,6 +485,8 @@ struct radv_device {
|
|||||||
|
|
||||||
uint64_t debug_flags;
|
uint64_t debug_flags;
|
||||||
|
|
||||||
|
bool llvm_supports_spill;
|
||||||
|
uint32_t scratch_waves;
|
||||||
/* MSAA sample locations.
|
/* MSAA sample locations.
|
||||||
* The first index is the sample index.
|
* The first index is the sample index.
|
||||||
* The second index is the coordinate: X, Y. */
|
* The second index is the coordinate: X, Y. */
|
||||||
@@ -726,6 +728,9 @@ struct radv_cmd_buffer {
|
|||||||
struct radv_cmd_buffer_upload upload;
|
struct radv_cmd_buffer_upload upload;
|
||||||
|
|
||||||
bool record_fail;
|
bool record_fail;
|
||||||
|
|
||||||
|
uint32_t scratch_size_needed;
|
||||||
|
uint32_t compute_scratch_size_needed;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct radv_image;
|
struct radv_image;
|
||||||
@@ -923,6 +928,9 @@ struct radv_pipeline {
|
|||||||
bool prim_restart_enable;
|
bool prim_restart_enable;
|
||||||
} graphics;
|
} graphics;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
unsigned max_waves;
|
||||||
|
unsigned scratch_bytes_per_wave;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct radv_graphics_pipeline_create_info {
|
struct radv_graphics_pipeline_create_info {
|
||||||
|
Reference in New Issue
Block a user