tu/perfetto: Allow gpu time to be passed into tu_perfetto_submit
In preparation for supporting Perfetto on KGSL: on KGSL, the GPU time is retrieved at submission and requires only minimal post-processing. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12805>
This commit is contained in:

committed by
Marge Bot

parent
18a47efb80
commit
7f59e37233
@@ -521,6 +521,11 @@ struct tu_u_trace_submission_data
|
||||
uint32_t cmd_buffer_count;
|
||||
uint32_t last_buffer_with_tracepoints;
|
||||
struct tu_u_trace_cmd_data *cmd_trace_data;
|
||||
|
||||
/* GPU time is reset on GPU power cycle and the GPU time
|
||||
* offset may change between submissions due to power cycle.
|
||||
*/
|
||||
uint64_t gpu_ts_offset;
|
||||
};
|
||||
|
||||
VkResult
|
||||
|
@@ -919,14 +919,18 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
|
||||
|
||||
p_atomic_set(&queue->fence, req.fence);
|
||||
|
||||
uint64_t gpu_offset = 0;
|
||||
#if HAVE_PERFETTO
|
||||
tu_perfetto_submit(queue->device, queue->device->submit_count);
|
||||
struct tu_perfetto_clocks clocks =
|
||||
tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
|
||||
gpu_offset = clocks.gpu_ts_offset;
|
||||
#endif
|
||||
|
||||
if (submit->u_trace_submission_data) {
|
||||
struct tu_u_trace_submission_data *submission_data =
|
||||
submit->u_trace_submission_data;
|
||||
submission_data->submission_id = queue->device->submit_count;
|
||||
submission_data->gpu_ts_offset = gpu_offset;
|
||||
/* We have to allocate it here since it is different between drm/kgsl */
|
||||
submission_data->syncobj = (struct tu_u_trace_syncobj *)
|
||||
vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
|
||||
|
@@ -247,6 +247,7 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
|
||||
auto trace_flush_data =
|
||||
(const struct tu_u_trace_submission_data *) flush_data;
|
||||
uint32_t submission_id = trace_flush_data->submission_id;
|
||||
uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset;
|
||||
|
||||
if (!stage)
|
||||
return;
|
||||
@@ -272,9 +273,9 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
|
||||
|
||||
auto packet = tctx.NewTracePacket();
|
||||
|
||||
gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_timestamp_offset);
|
||||
gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset);
|
||||
|
||||
packet->set_timestamp(stage->start_ts + gpu_timestamp_offset);
|
||||
packet->set_timestamp(stage->start_ts + gpu_ts_offset);
|
||||
packet->set_timestamp_clock_id(gpu_clock_id);
|
||||
|
||||
auto event = packet->set_gpu_render_stage_event();
|
||||
@@ -315,64 +316,13 @@ tu_perfetto_init(void)
|
||||
}
|
||||
|
||||
static void
|
||||
sync_timestamp(struct tu_device *dev)
|
||||
emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts)
|
||||
{
|
||||
uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
|
||||
uint64_t gpu_ts = 0;
|
||||
|
||||
if (cpu_ts < next_clock_sync_ns)
|
||||
return;
|
||||
|
||||
if (tu_device_get_gpu_timestamp(dev, &gpu_ts)) {
|
||||
PERFETTO_ELOG("Could not sync CPU and GPU clocks");
|
||||
return;
|
||||
}
|
||||
|
||||
/* get cpu timestamp again because tu_device_get_gpu_timestamp can take
|
||||
* >100us
|
||||
*/
|
||||
cpu_ts = perfetto::base::GetBootTimeNs().count();
|
||||
|
||||
uint64_t current_suspend_count = 0;
|
||||
/* If we fail to get it we will use a fallback */
|
||||
tu_device_get_suspend_count(dev, &current_suspend_count);
|
||||
|
||||
/* convert GPU ts into ns: */
|
||||
gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
|
||||
|
||||
/* GPU timestamp is being reset after suspend-resume cycle.
|
||||
* Perfetto requires clock snapshots to be monotonic,
|
||||
* so we have to fix-up the time.
|
||||
*/
|
||||
if (current_suspend_count != last_suspend_count) {
|
||||
gpu_timestamp_offset = gpu_max_timestamp;
|
||||
last_suspend_count = current_suspend_count;
|
||||
}
|
||||
|
||||
gpu_ts += gpu_timestamp_offset;
|
||||
|
||||
/* Fallback check, detect non-monotonic cases which would happen
|
||||
* if we cannot retrieve suspend count.
|
||||
*/
|
||||
if (sync_gpu_ts > gpu_ts) {
|
||||
gpu_ts += (gpu_max_timestamp - gpu_timestamp_offset);
|
||||
gpu_timestamp_offset = gpu_max_timestamp;
|
||||
}
|
||||
|
||||
if (sync_gpu_ts > gpu_ts) {
|
||||
PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
|
||||
return;
|
||||
}
|
||||
|
||||
TuRenderpassDataSource::Trace([=](auto tctx) {
|
||||
MesaRenderpassDataSource<TuRenderpassDataSource,
|
||||
TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
|
||||
gpu_ts, gpu_clock_id);
|
||||
});
|
||||
|
||||
gpu_max_timestamp = gpu_ts;
|
||||
sync_gpu_ts = gpu_ts;
|
||||
next_clock_sync_ns = cpu_ts + 30000000;
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -390,15 +340,87 @@ emit_submit_id(uint32_t submission_id)
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
|
||||
struct tu_perfetto_clocks
|
||||
tu_perfetto_submit(struct tu_device *dev,
|
||||
uint32_t submission_id,
|
||||
struct tu_perfetto_clocks *gpu_clocks)
|
||||
{
|
||||
/* sync_timestamp isn't free */
|
||||
if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
|
||||
return;
|
||||
struct tu_perfetto_clocks clocks {};
|
||||
if (gpu_clocks) {
|
||||
clocks = *gpu_clocks;
|
||||
}
|
||||
|
||||
sync_timestamp(dev);
|
||||
if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
|
||||
return {};
|
||||
|
||||
clocks.cpu = perfetto::base::GetBootTimeNs().count();
|
||||
|
||||
if (gpu_clocks) {
|
||||
/* TODO: It would be better to use CPU time that comes
|
||||
* together with GPU time from the KGSL, but it's not
|
||||
* equal to GetBootTimeNs.
|
||||
*/
|
||||
|
||||
clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset);
|
||||
gpu_timestamp_offset = clocks.gpu_ts_offset;
|
||||
sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
|
||||
} else {
|
||||
clocks.gpu_ts = 0;
|
||||
clocks.gpu_ts_offset = gpu_timestamp_offset;
|
||||
|
||||
if (clocks.cpu < next_clock_sync_ns)
|
||||
return clocks;
|
||||
|
||||
if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) {
|
||||
PERFETTO_ELOG("Could not sync CPU and GPU clocks");
|
||||
return {};
|
||||
}
|
||||
|
||||
clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts);
|
||||
|
||||
/* get cpu timestamp again because tu_device_get_gpu_timestamp can take
|
||||
* >100us
|
||||
*/
|
||||
clocks.cpu = perfetto::base::GetBootTimeNs().count();
|
||||
|
||||
uint64_t current_suspend_count = 0;
|
||||
/* If we fail to get it we will use a fallback */
|
||||
tu_device_get_suspend_count(dev, &current_suspend_count);
|
||||
|
||||
/* GPU timestamp is being reset after suspend-resume cycle.
|
||||
* Perfetto requires clock snapshots to be monotonic,
|
||||
* so we have to fix-up the time.
|
||||
*/
|
||||
if (current_suspend_count != last_suspend_count) {
|
||||
gpu_timestamp_offset = gpu_max_timestamp;
|
||||
last_suspend_count = current_suspend_count;
|
||||
}
|
||||
clocks.gpu_ts_offset = gpu_timestamp_offset;
|
||||
|
||||
uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
|
||||
|
||||
/* Fallback check, detect non-monotonic cases which would happen
|
||||
* if we cannot retrieve suspend count.
|
||||
*/
|
||||
if (sync_gpu_ts > gpu_absolute_ts) {
|
||||
gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset);
|
||||
gpu_timestamp_offset = gpu_max_timestamp;
|
||||
clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset;
|
||||
}
|
||||
|
||||
if (sync_gpu_ts > gpu_absolute_ts) {
|
||||
PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
|
||||
return {};
|
||||
}
|
||||
|
||||
gpu_max_timestamp = clocks.gpu_ts;
|
||||
sync_gpu_ts = clocks.gpu_ts;
|
||||
next_clock_sync_ns = clocks.cpu + 30000000;
|
||||
}
|
||||
|
||||
emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
|
||||
emit_submit_id(submission_id);
|
||||
return clocks;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -39,7 +39,17 @@ struct tu_perfetto_state {
|
||||
|
||||
void tu_perfetto_init(void);
|
||||
|
||||
void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id);
|
||||
struct tu_perfetto_clocks
|
||||
{
|
||||
uint64_t cpu;
|
||||
uint64_t gpu_ts;
|
||||
uint64_t gpu_ts_offset;
|
||||
};
|
||||
|
||||
struct tu_perfetto_clocks
|
||||
tu_perfetto_submit(struct tu_device *dev,
|
||||
uint32_t submission_id,
|
||||
struct tu_perfetto_clocks *clocks);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
Reference in New Issue
Block a user