diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c index 0874fafa548..4476e50b7a7 100644 --- a/src/freedreno/vulkan/tu_autotune.c +++ b/src/freedreno/vulkan/tu_autotune.c @@ -57,6 +57,9 @@ * time, so in most cases there will be no locking. */ +void +tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results); + #define TU_AUTOTUNE_DEBUG_LOG 0 /* Dump history entries on autotuner finish, * could be used to gather data from traces. @@ -68,7 +71,6 @@ /* For how many submissions we store renderpass stats. */ #define MAX_HISTORY_LIFETIME 128 -#define TU_AUTOTUNE_RP_BO_SIZE 4096 /** * Tracks results for a given renderpass key @@ -88,62 +90,12 @@ struct tu_renderpass_history { uint32_t avg_samples; }; -struct tu_autotune_results_buffer -{ - int32_t ref_cnt; - - struct tu_device *device; - - /* TODO: It would be better to suballocate the space from - * a memory pool which would create less BOs and waste less space. - */ - struct tu_bo **bos; - uint32_t num_bos; - uint32_t results_written; -}; - -static struct tu_autotune_results_buffer* -tu_autotune_results_buffer_create(struct tu_device *dev) -{ - struct tu_autotune_results_buffer* buffer = - malloc(sizeof(struct tu_autotune_results_buffer)); - - buffer->ref_cnt = 1; - buffer->device = dev; - buffer->results_written = 0; - buffer->num_bos = 0; - buffer->bos = NULL; - - return buffer; -} - -void -tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer) -{ - assert(buffer && buffer->ref_cnt >= 1); - p_atomic_inc(&buffer->ref_cnt); -} - -void -tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer) -{ - assert(buffer && buffer->ref_cnt >= 1); - if (p_atomic_dec_zero(&buffer->ref_cnt)) { - for (int i = 0; i < buffer->num_bos; i++) - tu_bo_finish(buffer->device, buffer->bos[i]); - - ralloc_free(buffer->bos); - free(buffer); - } -} - /* Holds per-submission cs which writes the fence. 
*/ struct tu_submission_data { struct list_head node; uint32_t fence; struct tu_cs fence_cs; - struct tu_autotune_results_buffer **buffers; uint32_t buffers_count; }; @@ -175,11 +127,7 @@ free_submission_data(struct tu_submission_data *data) { list_del(&data->node); tu_cs_finish(&data->fence_cs); - for (uint32_t i = 0; i < data->buffers_count; i++) { - tu_autotune_results_buffer_unref(data->buffers[i]); - } - free(data->buffers); free(data); } @@ -220,16 +168,17 @@ hash_renderpass_instance(const struct tu_render_pass *pass, } static void -free_result(struct tu_renderpass_result *result) +free_result(struct tu_device *dev, struct tu_renderpass_result *result) { + tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo); list_del(&result->node); free(result); } static void -free_history(struct tu_renderpass_history *history) +free_history(struct tu_device *dev, struct tu_renderpass_history *history) { - tu_autotune_free_results(&history->results); + tu_autotune_free_results_locked(dev, &history->results); free(history); } @@ -266,7 +215,7 @@ create_history_result(struct tu_autotune *at, uint64_t rp_key) } static void -history_add_result(struct tu_renderpass_history *history, +history_add_result(struct tu_device *dev, struct tu_renderpass_history *history, struct tu_renderpass_result *result) { list_delinit(&result->node); @@ -280,7 +229,9 @@ history_add_result(struct tu_renderpass_history *history, */ struct tu_renderpass_result *old_result = list_last_entry(&history->results, struct tu_renderpass_result, node); - free_result(old_result); + mtx_lock(&dev->autotune_mutex); + free_result(dev, old_result); + mtx_unlock(&dev->autotune_mutex); } /* Do calculations here to avoid locking history in tu_autotune_use_bypass */ @@ -297,7 +248,8 @@ history_add_result(struct tu_renderpass_history *history, static void process_results(struct tu_autotune *at) { - struct tu6_global *global = at->device->global_bo->map; + struct tu_device *dev = at->device; + struct tu6_global 
*global = dev->global_bo->map; uint32_t current_fence = global->autotune_fence; list_for_each_entry_safe(struct tu_renderpass_result, result, @@ -309,7 +261,7 @@ process_results(struct tu_autotune *at) result->samples_passed = result->samples->samples_end - result->samples->samples_start; - history_add_result(history, result); + history_add_result(dev, history, result); } list_for_each_entry_safe(struct tu_submission_data, submission_data, @@ -338,6 +290,7 @@ queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf) /* TODO: copying each result isn't nice */ struct tu_renderpass_result *copy = malloc(sizeof(*result)); *copy = *result; + tu_bo_get_ref(copy->bo.bo); list_addtail(&copy->node, &at->pending_results); } } @@ -393,19 +346,13 @@ tu_autotune_on_submit(struct tu_device *dev, struct tu_submission_data *submission_data = create_submission_data(dev, at); submission_data->buffers_count = result_buffers; - submission_data->buffers = - malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers); - uint32_t buffer_idx = 0; for (uint32_t i = 0; i < cmd_buffer_count; i++) { struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; if (list_is_empty(&cmdbuf->renderpass_autotune_results)) continue; queue_pending_results(at, cmdbuf); - - submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer; - tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer); } #if TU_AUTOTUNE_DEBUG_LOG != 0 @@ -430,7 +377,9 @@ tu_autotune_on_submit(struct tu_device *dev, _mesa_hash_table_remove_key(at->ht, &history->key); u_rwlock_wrunlock(&at->ht_lock); - free_history(history); + mtx_lock(&dev->autotune_mutex); + free_history(dev, history); + mtx_unlock(&dev->autotune_mutex); } return &submission_data->fence_cs; @@ -480,12 +429,14 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) } #endif - tu_autotune_free_results(&at->pending_results); + tu_autotune_free_results(dev, &at->pending_results); + mtx_lock(&dev->autotune_mutex); hash_table_foreach(at->ht, 
entry) { struct tu_renderpass_history *history = entry->data; - free_history(history); + free_history(dev, history); } + mtx_unlock(&dev->autotune_mutex); list_for_each_entry_safe(struct tu_submission_data, submission_data, &at->pending_submission_data, node) { @@ -510,14 +461,22 @@ tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, } void -tu_autotune_free_results(struct list_head *results) +tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results) { list_for_each_entry_safe(struct tu_renderpass_result, result, results, node) { - free_result(result); + free_result(dev, result); } } +void +tu_autotune_free_results(struct tu_device *dev, struct list_head *results) +{ + mtx_lock(&dev->autotune_mutex); + tu_autotune_free_results_locked(dev, results); + mtx_unlock(&dev->autotune_mutex); +} + static bool fallback_use_bypass(const struct tu_render_pass *pass, const struct tu_framebuffer *framebuffer, @@ -624,32 +583,6 @@ tu_autotune_use_bypass(struct tu_autotune *at, return fallback_use_bypass(pass, framebuffer, cmd_buffer); } -static uint32_t -get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer) -{ - uint32_t results_per_bo = - TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples); - return (buffer->results_written % results_per_bo) * - sizeof(struct tu_renderpass_samples); -} - -static struct tu_bo * -get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer) -{ - if (get_offset_for_renderpass(buffer) == 0) { - buffer->num_bos++; - buffer->bos = - reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos); - struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1]; - - tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE, - TU_BO_ALLOC_NO_FLAGS); - tu_bo_map(buffer->device, *new_bo); - } - - return buffer->bos[buffer->num_bos - 1]; -} - void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -658,21 +591,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer 
*cmd, if (!autotune_result) return; - /* Lazily allocate memory for renderpass results. - * Secondary command buffers do not support renderpasses. - */ - assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - if (!cmd->autotune_buffer) { - cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device); + struct tu_device *dev = cmd->device; + + static const uint32_t size = sizeof(struct tu_renderpass_samples); + + mtx_lock(&dev->autotune_mutex); + VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size); + mtx_unlock(&dev->autotune_mutex); + if (ret != VK_SUCCESS) { + autotune_result->bo.iova = 0; + return; } - uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer); - struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer); + uint64_t result_iova = autotune_result->bo.iova; - uint64_t result_iova = bo->iova + bo_offset; - - autotune_result->samples = - (struct tu_renderpass_samples *) (bo->map + bo_offset); + autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo); tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); @@ -689,11 +622,10 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, if (!autotune_result) return; - uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer); - struct tu_bo *bo = cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1]; - cmd->autotune_buffer->results_written += 1; + if (!autotune_result->bo.iova) + return; - uint64_t result_iova = bo->iova + bo_offset + + uint64_t result_iova = autotune_result->bo.iova + offsetof(struct tu_renderpass_samples, samples_end); tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 6d3fba012a6..633e5ba5e89 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -32,6 +32,7 @@ struct tu_device; struct tu_cmd_buffer; struct tu_renderpass_history; 
+struct tu_renderpass_result; /** * "autotune" our decisions about bypass vs GMEM rendering, based on historical @@ -111,32 +112,13 @@ struct tu_renderpass_samples { uint64_t __pad1; }; -/** - * Tracks the results from an individual renderpass. Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. - */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev); void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev); bool tu_autotune_use_bypass(struct tu_autotune *at, struct tu_cmd_buffer *cmd_buffer, struct tu_renderpass_result **autotune_result); -void tu_autotune_free_results(struct list_head *results); +void tu_autotune_free_results(struct tu_device *dev, struct list_head *results); bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count); @@ -152,9 +134,6 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev, struct tu_autotune_results_buffer; -void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer); -void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer); - void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result *autotune_result); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 40bef6bdf69..05c98f0d9d4 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1514,9 +1514,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) u_trace_fini(&cmd_buffer->trace); - if 
(cmd_buffer->autotune_buffer) - tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer); - tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results); + tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { if (cmd_buffer->descriptors[i].push_set.layout) @@ -1542,16 +1540,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) tu_cs_reset(&cmd_buffer->draw_epilogue_cs); tu_cs_reset(&cmd_buffer->sub_cs); - /* We can't just reset the autotune_buffer's contents, because it is also - * referenced by the submission_data if the command buffer was submitted - * and we may be accessing it after cmdbuf reset/free. - */ - if (cmd_buffer->autotune_buffer) { - tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer); - cmd_buffer->autotune_buffer = NULL; - } - - tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results); + tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index e2d22892fe2..351affb259c 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1729,6 +1729,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->bo_mutex, mtx_plain); mtx_init(&device->pipeline_mutex, mtx_plain); + mtx_init(&device->autotune_mutex, mtx_plain); u_rwlock_init(&device->dma_bo_lock); pthread_mutex_init(&device->submit_mutex, NULL); @@ -1789,6 +1790,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, tu_bo_suballocator_init(&device->pipeline_suballoc, device, 128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP); + tu_bo_suballocator_init(&device->autotune_suballoc, device, + 128 * 1024, 0); result = tu_bo_init_new(device, &device->global_bo, global_size, 
TU_BO_ALLOC_ALLOW_DUMP); @@ -1992,6 +1995,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) tu_autotune_fini(&device->autotune, device); tu_bo_suballocator_finish(&device->pipeline_suballoc); + tu_bo_suballocator_finish(&device->autotune_suballoc); util_sparse_array_finish(&device->bo_map); u_rwlock_destroy(&device->dma_bo_lock); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index b992c381cfd..28a9c5ea1c8 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -438,6 +438,27 @@ enum global_shader { GLOBAL_SH_COUNT, }; +/** + * Tracks the results from an individual renderpass. Initially created + * per renderpass, and appended to the tail of at->pending_results. At a later + * time, when the GPU has finished writing the results, we fill samples_passed. + */ +struct tu_renderpass_result { + /* Points into GPU memory */ + struct tu_renderpass_samples* samples; + + struct tu_suballoc_bo bo; + + /* + * Below here, only used internally within autotune + */ + uint64_t rp_key; + struct tu_renderpass_history *history; + struct list_head node; + uint32_t fence; + uint64_t samples_passed; +}; + #define TU_BORDER_COLOR_COUNT 4096 #define TU_BORDER_COLOR_BUILTIN 6 @@ -514,6 +535,12 @@ struct tu_device struct tu_suballocator pipeline_suballoc; mtx_t pipeline_mutex; + /* Device-global BO suballocator for reducing BO management for small + * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. + */ + struct tu_suballocator autotune_suballoc; + mtx_t autotune_mutex; + /* the blob seems to always use 8K factor and 128K param sizes, copy them */ #define TU_TESS_FACTOR_SIZE (8 * 1024) #define TU_TESS_PARAM_SIZE (128 * 1024)