ac/sqtt: add ac_thread_trace_data

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8002>
This commit is contained in:
Pierre-Eric Pelloux-Prayer
2020-12-08 11:06:48 +01:00
parent b94104c0c0
commit 04f6ba113c
8 changed files with 107 additions and 70 deletions

View File

@@ -51,7 +51,8 @@ AMD_COMMON_FILES = \
common/ac_shader_util.c \
common/ac_shader_util.h \
common/ac_shadowed_regs.c \
common/ac_shadowed_regs.h
common/ac_shadowed_regs.h \
common/ac_sqtt.h
AMD_COMMON_LLVM_FILES = \
llvm/ac_llvm_build.c \

40
src/amd/common/ac_sqtt.h Normal file
View File

@@ -0,0 +1,40 @@
/*
* Copyright 2020 Advanced Micro Devices, Inc.
* Copyright 2020 Valve Corporation
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef AC_SQTT_H
#define AC_SQTT_H

/* uint32_t is used below; include it here so this header is self-contained
 * and does not depend on what the includer happened to pull in first.
 */
#include <stdint.h>

/* Only pointers to command streams are stored, so a forward declaration is
 * sufficient; the full definition lives in the driver's winsys headers.
 */
struct radeon_cmdbuf;

/**
 * State needed to capture an SQ thread trace (SQTT), shared between drivers.
 *
 * The structure is plain data: it owns nothing by itself. The driver that
 * embeds it is responsible for creating and destroying the command streams
 * and the buffer, and for freeing \c trigger_file.
 */
struct ac_thread_trace_data {
   /* Pre-recorded command streams that start the trace, indexed by queue
    * family (two entries: one per family the driver records for).
    */
   struct radeon_cmdbuf *start_cs[2];

   /* Pre-recorded command streams that stop the trace, indexed the same way
    * as start_cs.
    */
   struct radeon_cmdbuf *stop_cs[2];

   /* Buffer object receiving the trace data.
    * Actual type is struct radeon_winsys_bo or struct pb_buffer, which is
    * why it is kept as an untyped pointer here.
    */
   void *bo;

   /* CPU mapping of bo, used to read the trace back. */
   void *ptr;

   /* Size in bytes of the trace buffer for a single shader engine; the
    * driver aligns it before allocating bo.
    */
   uint32_t buffer_size;

   /* Frame number at which a capture is triggered. RADV initializes this to
    * -1 when no start frame was requested.
    */
   int start_frame;

   /* Heap-allocated path of an optional trigger file, or NULL. RADV starts a
    * capture when the file exists and can be removed.
    */
   char *trigger_file;
};

#endif

View File

@@ -80,6 +80,7 @@ amd_common_files = files(
'ac_debug.h',
'ac_shadowed_regs.c',
'ac_shadowed_regs.h',
'ac_sqtt.h',
)
libamd_common = static_library(

View File

@@ -437,7 +437,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
struct rgp_sqtt_marker_cb_start marker = {0};
struct radeon_cmdbuf *cs = cmd_buffer->cs;
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_START;
@@ -462,7 +462,7 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
struct rgp_sqtt_marker_cb_end marker = {0};
struct radeon_cmdbuf *cs = cmd_buffer->cs;
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_END;
@@ -476,7 +476,7 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
void
radv_describe_draw(struct radv_cmd_buffer *cmd_buffer)
{
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
radv_write_event_marker(cmd_buffer, cmd_buffer->state.current_event_type,
@@ -486,7 +486,7 @@ radv_describe_draw(struct radv_cmd_buffer *cmd_buffer)
void
radv_describe_dispatch(struct radv_cmd_buffer *cmd_buffer, int x, int y, int z)
{
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
radv_write_event_with_dims_marker(cmd_buffer,
@@ -514,7 +514,7 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
struct rgp_sqtt_marker_barrier_end marker = {0};
struct radeon_cmdbuf *cs = cmd_buffer->cs;
if (likely(!cmd_buffer->device->thread_trace_bo) ||
if (likely(!cmd_buffer->device->thread_trace.bo) ||
!cmd_buffer->state.pending_sqtt_barrier_end)
return;
@@ -571,7 +571,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
struct rgp_sqtt_marker_barrier_start marker = {0};
struct radeon_cmdbuf *cs = cmd_buffer->cs;
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
radv_describe_barrier_end_delayed(cmd_buffer);
@@ -597,7 +597,7 @@ radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
struct rgp_sqtt_marker_layout_transition marker = {0};
struct radeon_cmdbuf *cs = cmd_buffer->cs;
if (likely(!cmd_buffer->device->thread_trace_bo))
if (likely(!cmd_buffer->device->thread_trace.bo))
return;
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION;
@@ -635,11 +635,11 @@ radv_handle_thread_trace(VkQueue _queue)
if (radv_get_thread_trace(queue, &thread_trace))
radv_dump_thread_trace(queue->device, &thread_trace);
} else {
bool frame_trigger = num_frames == queue->device->thread_trace_start_frame;
bool frame_trigger = num_frames == queue->device->thread_trace.start_frame;
bool file_trigger = false;
if (queue->device->thread_trace_trigger_file &&
access(queue->device->thread_trace_trigger_file, W_OK) == 0) {
if (unlink(queue->device->thread_trace_trigger_file) == 0) {
if (queue->device->thread_trace.trigger_file &&
access(queue->device->thread_trace.trigger_file, W_OK) == 0) {
if (unlink(queue->device->thread_trace.trigger_file) == 0) {
file_trigger = true;
} else {
/* Do not enable tracing if we cannot remove the file,

View File

@@ -627,7 +627,7 @@ static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
enum radv_cmd_flush_bits flags)
{
if (unlikely(cmd_buffer->device->thread_trace_bo)) {
if (unlikely(cmd_buffer->device->thread_trace.bo)) {
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
}

View File

@@ -2908,13 +2908,13 @@ VkResult radv_CreateDevice(
}
/* Default buffer size set to 1MB per SE. */
device->thread_trace_buffer_size =
device->thread_trace.buffer_size =
radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 1024 * 1024);
device->thread_trace_start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);
device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);
const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER");
if (trigger_file)
device->thread_trace_trigger_file = strdup(trigger_file);
device->thread_trace.trigger_file = strdup(trigger_file);
if (!radv_thread_trace_init(device))
goto fail;
@@ -3013,7 +3013,7 @@ fail:
radv_bo_list_finish(&device->bo_list);
radv_thread_trace_finish(device);
free(device->thread_trace_trigger_file);
free(device->thread_trace.trigger_file);
radv_trap_handler_finish(device);
@@ -3073,7 +3073,7 @@ void radv_DestroyDevice(
u_cnd_monotonic_destroy(&device->timeline_cond);
radv_bo_list_finish(&device->bo_list);
free(device->thread_trace_trigger_file);
free(device->thread_trace.trigger_file);
radv_thread_trace_finish(device);
vk_free(&device->vk.alloc, device);

View File

@@ -67,6 +67,7 @@
#include "radv_descriptor_set.h"
#include "radv_extensions.h"
#include "sid.h"
#include "ac_sqtt.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@@ -846,13 +847,7 @@ struct radv_device {
struct u_cnd_monotonic timeline_cond;
/* Thread trace. */
struct radeon_cmdbuf *thread_trace_start_cs[2];
struct radeon_cmdbuf *thread_trace_stop_cs[2];
struct radeon_winsys_bo *thread_trace_bo;
void *thread_trace_ptr;
uint32_t thread_trace_buffer_size;
int thread_trace_start_frame;
char *thread_trace_trigger_file;
struct ac_thread_trace_data thread_trace;
/* Trap handler. */
struct radv_shader_variant *trap_handler_shader;

View File

@@ -42,7 +42,7 @@ radv_thread_trace_get_data_offset(struct radv_device *device, unsigned se)
data_offset = align64(sizeof(struct radv_thread_trace_info) * 4,
1 << SQTT_BUFFER_ALIGN_SHIFT);
data_offset += device->thread_trace_buffer_size * se;
data_offset += device->thread_trace.buffer_size * se;
return data_offset;
}
@@ -50,14 +50,14 @@ radv_thread_trace_get_data_offset(struct radv_device *device, unsigned se)
static uint64_t
radv_thread_trace_get_info_va(struct radv_device *device, unsigned se)
{
uint64_t va = radv_buffer_get_va(device->thread_trace_bo);
uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
return va + radv_thread_trace_get_info_offset(se);
}
static uint64_t
radv_thread_trace_get_data_va(struct radv_device *device, unsigned se)
{
uint64_t va = radv_buffer_get_va(device->thread_trace_bo);
uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
return va + radv_thread_trace_get_data_offset(device, se);
}
@@ -66,7 +66,7 @@ radv_emit_thread_trace_start(struct radv_device *device,
struct radeon_cmdbuf *cs,
uint32_t queue_family_index)
{
uint32_t shifted_size = device->thread_trace_buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
unsigned max_se = device->physical_device->rad_info.max_se;
assert(device->physical_device->rad_info.chip_class >= GFX8);
@@ -412,80 +412,80 @@ radv_thread_trace_init_cs(struct radv_device *device)
/* Thread trace start CS. */
for (int family = 0; family < 2; ++family) {
device->thread_trace_start_cs[family] = ws->cs_create(ws, family);
if (!device->thread_trace_start_cs[family])
device->thread_trace.start_cs[family] = ws->cs_create(ws, family);
if (!device->thread_trace.start_cs[family])
return;
switch (family) {
case RADV_QUEUE_GENERAL:
radeon_emit(device->thread_trace_start_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(device->thread_trace_start_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(device->thread_trace_start_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
radeon_emit(device->thread_trace.start_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(device->thread_trace.start_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(device->thread_trace.start_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
break;
case RADV_QUEUE_COMPUTE:
radeon_emit(device->thread_trace_start_cs[family], PKT3(PKT3_NOP, 0, 0));
radeon_emit(device->thread_trace_start_cs[family], 0);
radeon_emit(device->thread_trace.start_cs[family], PKT3(PKT3_NOP, 0, 0));
radeon_emit(device->thread_trace.start_cs[family], 0);
break;
}
radv_cs_add_buffer(ws, device->thread_trace_start_cs[family],
device->thread_trace_bo);
radv_cs_add_buffer(ws, device->thread_trace.start_cs[family],
device->thread_trace.bo);
/* Make sure to wait-for-idle before starting SQTT. */
radv_emit_wait_for_idle(device,
device->thread_trace_start_cs[family],
device->thread_trace.start_cs[family],
family);
/* Enable SQG events that collect thread trace data. */
radv_emit_spi_config_cntl(device,
device->thread_trace_start_cs[family],
device->thread_trace.start_cs[family],
true);
radv_emit_thread_trace_start(device,
device->thread_trace_start_cs[family],
device->thread_trace.start_cs[family],
family);
result = ws->cs_finalize(device->thread_trace_start_cs[family]);
result = ws->cs_finalize(device->thread_trace.start_cs[family]);
if (result != VK_SUCCESS)
return;
}
/* Thread trace stop CS. */
for (int family = 0; family < 2; ++family) {
device->thread_trace_stop_cs[family] = ws->cs_create(ws, family);
if (!device->thread_trace_stop_cs[family])
device->thread_trace.stop_cs[family] = ws->cs_create(ws, family);
if (!device->thread_trace.stop_cs[family])
return;
switch (family) {
case RADV_QUEUE_GENERAL:
radeon_emit(device->thread_trace_stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(device->thread_trace_stop_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(device->thread_trace_stop_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
radeon_emit(device->thread_trace.stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(device->thread_trace.stop_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
radeon_emit(device->thread_trace.stop_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
break;
case RADV_QUEUE_COMPUTE:
radeon_emit(device->thread_trace_stop_cs[family], PKT3(PKT3_NOP, 0, 0));
radeon_emit(device->thread_trace_stop_cs[family], 0);
radeon_emit(device->thread_trace.stop_cs[family], PKT3(PKT3_NOP, 0, 0));
radeon_emit(device->thread_trace.stop_cs[family], 0);
break;
}
radv_cs_add_buffer(ws, device->thread_trace_stop_cs[family],
device->thread_trace_bo);
radv_cs_add_buffer(ws, device->thread_trace.stop_cs[family],
device->thread_trace.bo);
/* Make sure to wait-for-idle before stopping SQTT. */
radv_emit_wait_for_idle(device,
device->thread_trace_stop_cs[family],
device->thread_trace.stop_cs[family],
family);
radv_emit_thread_trace_stop(device,
device->thread_trace_stop_cs[family],
device->thread_trace.stop_cs[family],
family);
/* Restore previous state by disabling SQG events. */
radv_emit_spi_config_cntl(device,
device->thread_trace_stop_cs[family],
device->thread_trace.stop_cs[family],
false);
result = ws->cs_finalize(device->thread_trace_stop_cs[family]);
result = ws->cs_finalize(device->thread_trace.stop_cs[family]);
if (result != VK_SUCCESS)
return;
}
@@ -500,25 +500,25 @@ radv_thread_trace_init_bo(struct radv_device *device)
/* The buffer size and address need to be aligned in HW regs. Align the
* size as early as possible so that we do all the allocation & addressing
* correctly. */
device->thread_trace_buffer_size = align64(device->thread_trace_buffer_size,
device->thread_trace.buffer_size = align64(device->thread_trace.buffer_size,
1u << SQTT_BUFFER_ALIGN_SHIFT);
/* Compute total size of the thread trace BO for 4 SEs. */
size = align64(sizeof(struct radv_thread_trace_info) * 4,
1 << SQTT_BUFFER_ALIGN_SHIFT);
size += device->thread_trace_buffer_size * 4;
size += device->thread_trace.buffer_size * 4;
device->thread_trace_bo = ws->buffer_create(ws, size, 4096,
device->thread_trace.bo = ws->buffer_create(ws, size, 4096,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_CPU_ACCESS |
RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_ZERO_VRAM,
RADV_BO_PRIORITY_SCRATCH);
if (!device->thread_trace_bo)
if (!device->thread_trace.bo)
return false;
device->thread_trace_ptr = ws->buffer_map(device->thread_trace_bo);
if (!device->thread_trace_ptr)
device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo);
if (!device->thread_trace.ptr)
return false;
return true;
@@ -539,14 +539,14 @@ radv_thread_trace_finish(struct radv_device *device)
{
struct radeon_winsys *ws = device->ws;
if (unlikely(device->thread_trace_bo))
ws->buffer_destroy(device->thread_trace_bo);
if (unlikely(device->thread_trace.bo))
ws->buffer_destroy(device->thread_trace.bo);
for (unsigned i = 0; i < 2; i++) {
if (device->thread_trace_start_cs[i])
ws->cs_destroy(device->thread_trace_start_cs[i]);
if (device->thread_trace_stop_cs[i])
ws->cs_destroy(device->thread_trace_stop_cs[i]);
if (device->thread_trace.start_cs[i])
ws->cs_destroy(device->thread_trace.start_cs[i]);
if (device->thread_trace.stop_cs[i])
ws->cs_destroy(device->thread_trace.stop_cs[i]);
}
}
@@ -554,7 +554,7 @@ bool
radv_begin_thread_trace(struct radv_queue *queue)
{
int family = queue->queue_family_index;
struct radeon_cmdbuf *cs = queue->device->thread_trace_start_cs[family];
struct radeon_cmdbuf *cs = queue->device->thread_trace.start_cs[family];
return radv_queue_internal_submit(queue, cs);
}
@@ -562,7 +562,7 @@ bool
radv_end_thread_trace(struct radv_queue *queue)
{
int family = queue->queue_family_index;
struct radeon_cmdbuf *cs = queue->device->thread_trace_stop_cs[family];
struct radeon_cmdbuf *cs = queue->device->thread_trace.stop_cs[family];
return radv_queue_internal_submit(queue, cs);
}
@@ -602,7 +602,7 @@ radv_get_thread_trace(struct radv_queue *queue,
{
struct radv_device *device = queue->device;
unsigned max_se = device->physical_device->rad_info.max_se;
void *thread_trace_ptr = device->thread_trace_ptr;
void *thread_trace_ptr = device->thread_trace.ptr;
memset(thread_trace, 0, sizeof(*thread_trace));
thread_trace->num_traces = max_se;