tu/perfetto: Allow gpu time to be passed into tu_perfetto_submit

This is in preparation for supporting perfetto on KGSL, where GPU time is
retrieved at submission and requires only minimal post-processing.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12805>
This commit is contained in:
Danylo Piliaiev
2023-08-09 16:14:07 +02:00
committed by Marge Bot
parent 18a47efb80
commit 7f59e37233
4 changed files with 103 additions and 62 deletions

View File

@@ -521,6 +521,11 @@ struct tu_u_trace_submission_data
uint32_t cmd_buffer_count;
uint32_t last_buffer_with_tracepoints;
struct tu_u_trace_cmd_data *cmd_trace_data;
/* GPU time is reset on GPU power cycle and the GPU time
* offset may change between submissions due to power cycle.
*/
uint64_t gpu_ts_offset;
};
VkResult

View File

@@ -919,14 +919,18 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
p_atomic_set(&queue->fence, req.fence);
uint64_t gpu_offset = 0;
#if HAVE_PERFETTO
tu_perfetto_submit(queue->device, queue->device->submit_count);
struct tu_perfetto_clocks clocks =
tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
gpu_offset = clocks.gpu_ts_offset;
#endif
if (submit->u_trace_submission_data) {
struct tu_u_trace_submission_data *submission_data =
submit->u_trace_submission_data;
submission_data->submission_id = queue->device->submit_count;
submission_data->gpu_ts_offset = gpu_offset;
/* We have to allocate it here since it is different between drm/kgsl */
submission_data->syncobj = (struct tu_u_trace_syncobj *)
vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),

View File

@@ -247,6 +247,7 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
auto trace_flush_data =
(const struct tu_u_trace_submission_data *) flush_data;
uint32_t submission_id = trace_flush_data->submission_id;
uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset;
if (!stage)
return;
@@ -272,9 +273,9 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
auto packet = tctx.NewTracePacket();
gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_timestamp_offset);
gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset);
packet->set_timestamp(stage->start_ts + gpu_timestamp_offset);
packet->set_timestamp(stage->start_ts + gpu_ts_offset);
packet->set_timestamp_clock_id(gpu_clock_id);
auto event = packet->set_gpu_render_stage_event();
@@ -315,64 +316,13 @@ tu_perfetto_init(void)
}
static void
sync_timestamp(struct tu_device *dev)
emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts)
{
uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
uint64_t gpu_ts = 0;
if (cpu_ts < next_clock_sync_ns)
return;
if (tu_device_get_gpu_timestamp(dev, &gpu_ts)) {
PERFETTO_ELOG("Could not sync CPU and GPU clocks");
return;
}
/* get cpu timestamp again because tu_device_get_gpu_timestamp can take
* >100us
*/
cpu_ts = perfetto::base::GetBootTimeNs().count();
uint64_t current_suspend_count = 0;
/* If we fail to get it we will use a fallback */
tu_device_get_suspend_count(dev, &current_suspend_count);
/* convert GPU ts into ns: */
gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
/* GPU timestamp is being reset after suspend-resume cycle.
* Perfetto requires clock snapshots to be monotonic,
* so we have to fix-up the time.
*/
if (current_suspend_count != last_suspend_count) {
gpu_timestamp_offset = gpu_max_timestamp;
last_suspend_count = current_suspend_count;
}
gpu_ts += gpu_timestamp_offset;
/* Fallback check, detect non-monotonic cases which would happen
* if we cannot retrieve suspend count.
*/
if (sync_gpu_ts > gpu_ts) {
gpu_ts += (gpu_max_timestamp - gpu_timestamp_offset);
gpu_timestamp_offset = gpu_max_timestamp;
}
if (sync_gpu_ts > gpu_ts) {
PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
return;
}
TuRenderpassDataSource::Trace([=](auto tctx) {
MesaRenderpassDataSource<TuRenderpassDataSource,
TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
gpu_ts, gpu_clock_id);
});
gpu_max_timestamp = gpu_ts;
sync_gpu_ts = gpu_ts;
next_clock_sync_ns = cpu_ts + 30000000;
}
static void
@@ -390,15 +340,87 @@ emit_submit_id(uint32_t submission_id)
});
}
void
tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
/* Emit a CPU/GPU clock-sync packet and the submission id for perfetto, and
 * return the clock values that were used.
 *
 * dev           - device the submission belongs to.
 * submission_id - id forwarded to emit_submit_id().
 * gpu_clocks    - optional; when non-NULL its contents are copied and used as
 *                 the GPU time instead of querying the device. When NULL the
 *                 GPU timestamp is read via tu_device_get_gpu_timestamp().
 *
 * Returns a zero-initialized struct when perfetto tracing is not active, when
 * the GPU timestamp cannot be read, or when a non-monotonic GPU timestamp is
 * detected. When gpu_clocks is NULL and the next sync time has not been
 * reached yet, returns early with only cpu and gpu_ts_offset filled in
 * (gpu_ts is 0) and emits nothing.
 */
struct tu_perfetto_clocks
tu_perfetto_submit(struct tu_device *dev,
uint32_t submission_id,
struct tu_perfetto_clocks *gpu_clocks)
{
/* sync_timestamp isn't free */
if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
return;
struct tu_perfetto_clocks clocks {};
if (gpu_clocks) {
clocks = *gpu_clocks;
}
sync_timestamp(dev);
if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
return {};
clocks.cpu = perfetto::base::GetBootTimeNs().count();
if (gpu_clocks) {
/* TODO: It would be better to use CPU time that comes
* together with GPU time from the KGSL, but it's not
* equal to GetBootTimeNs.
*/
/* Never let the offset go below the one already in use, then remember it
 * and the resulting offset-adjusted GPU time.
 */
clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset);
gpu_timestamp_offset = clocks.gpu_ts_offset;
sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
} else {
clocks.gpu_ts = 0;
clocks.gpu_ts_offset = gpu_timestamp_offset;
/* Too early for another clock sync: hand back the current offset only.
 * Note that this return also skips emit_submit_id() below.
 */
if (clocks.cpu < next_clock_sync_ns)
return clocks;
if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) {
PERFETTO_ELOG("Could not sync CPU and GPU clocks");
return {};
}
/* convert GPU ticks into ns */
clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts);
/* get cpu timestamp again because tu_device_get_gpu_timestamp can take
* >100us
*/
clocks.cpu = perfetto::base::GetBootTimeNs().count();
uint64_t current_suspend_count = 0;
/* If we fail to get it we will use a fallback */
tu_device_get_suspend_count(dev, &current_suspend_count);
/* GPU timestamp is being reset after suspend-resume cycle.
* Perfetto requires clock snapshots to be monotonic,
* so we have to fix-up the time.
*/
if (current_suspend_count != last_suspend_count) {
gpu_timestamp_offset = gpu_max_timestamp;
last_suspend_count = current_suspend_count;
}
clocks.gpu_ts_offset = gpu_timestamp_offset;
uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
/* Fallback check, detect non-monotonic cases which would happen
* if we cannot retrieve suspend count.
*/
/* NOTE(review): this branch updates gpu_timestamp_offset but not
 * clocks.gpu_ts_offset, which was copied just above. The sync timestamp
 * emitted at the end is clocks.gpu_ts + clocks.gpu_ts_offset, so it appears
 * to use the pre-fallback offset rather than gpu_absolute_ts - confirm
 * whether clocks.gpu_ts_offset should be refreshed here.
 */
if (sync_gpu_ts > gpu_absolute_ts) {
gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset);
gpu_timestamp_offset = gpu_max_timestamp;
clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset;
}
if (sync_gpu_ts > gpu_absolute_ts) {
PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
return {};
}
/* NOTE(review): these store clocks.gpu_ts without the offset, while
 * sync_gpu_ts is compared against gpu_absolute_ts above and is assigned an
 * offset-adjusted value in the gpu_clocks branch - confirm whether
 * gpu_absolute_ts was intended here.
 */
gpu_max_timestamp = clocks.gpu_ts;
sync_gpu_ts = clocks.gpu_ts;
/* next sync no earlier than 30000000 ns (30 ms) after this one */
next_clock_sync_ns = clocks.cpu + 30000000;
}
/* The GPU side of the clock sync is the offset-adjusted timestamp. */
emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
emit_submit_id(submission_id);
return clocks;
}
/*

View File

@@ -39,7 +39,17 @@ struct tu_perfetto_state {
void tu_perfetto_init(void);
void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id);
/* Clock values tied to one submission; returned by tu_perfetto_submit() and
 * optionally passed into it when the caller already has the GPU time.
 */
struct tu_perfetto_clocks
{
/* CPU timestamp; tu_perfetto_submit() fills it from perfetto's
 * GetBootTimeNs().
 */
uint64_t cpu;
/* GPU timestamp before gpu_ts_offset is applied. In ns when
 * tu_perfetto_submit() queries it itself; presumably also ns when supplied
 * by the caller - TODO confirm.
 */
uint64_t gpu_ts;
/* Offset added to GPU timestamps (gpu_ts + gpu_ts_offset) before they are
 * emitted to perfetto.
 */
uint64_t gpu_ts_offset;
};
struct tu_perfetto_clocks
tu_perfetto_submit(struct tu_device *dev,
uint32_t submission_id,
struct tu_perfetto_clocks *clocks);
#ifdef __cplusplus
}