anv: implement u_trace support

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Acked-by: Antonio Caggiano <antonio.caggiano@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13996>
This commit is contained in:
Lionel Landwerlin
2021-11-18 17:45:57 +02:00
committed by Marge Bot
parent bb541d1159
commit cc5843a573
12 changed files with 759 additions and 59 deletions

View File

@@ -37,6 +37,7 @@
#include "perf/intel_perf.h"
#include "util/debug.h"
#include "util/perf/u_trace.h"
/** \file anv_batch_chain.c
*
@@ -1956,6 +1957,94 @@ setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
return VK_SUCCESS;
}
/* Build the i915 execbuffer for the standalone utrace timestamp-copy batch.
 *
 * Adds flush->batch_bo (plus its relocations) and the signalling sync to
 * @execbuf, makes sure the batch BO sits in the last exec-object slot as the
 * kernel requires, flushes CPU caches on non-LLC platforms, and fills in
 * execbuf->execbuf ready for DRM_IOCTL_I915_GEM_EXECBUFFER2.
 *
 * Returns VK_SUCCESS or the first failure from the add-bo/add-sync helpers.
 */
static VkResult
setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
                     struct anv_utrace_flush_copy *flush)
{
   struct anv_device *device = queue->device;
   VkResult result = anv_execbuf_add_bo(device, execbuf,
                                        flush->batch_bo,
                                        &flush->relocs, 0);
   if (result != VK_SUCCESS)
      return result;

   /* Signal flush->sync when the copies complete so the CPU side knows when
    * the copied timestamps are safe to read.
    */
   result = anv_execbuf_add_sync(device, execbuf, flush->sync,
                                 true /* is_signal */, 0 /* value */);
   if (result != VK_SUCCESS)
      return result;

   /* The kernel expects the batch BO to be the last object in the exec list;
    * swap it into the final slot if it landed anywhere else.
    */
   if (flush->batch_bo->exec_obj_index != execbuf->bo_count - 1) {
      uint32_t idx = flush->batch_bo->exec_obj_index;
      uint32_t last_idx = execbuf->bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
      assert(execbuf->bos[idx] == flush->batch_bo);

      execbuf->objects[idx] = execbuf->objects[last_idx];
      execbuf->bos[idx] = execbuf->bos[last_idx];
      execbuf->bos[idx]->exec_obj_index = idx;

      execbuf->objects[last_idx] = tmp_obj;
      execbuf->bos[last_idx] = flush->batch_bo;
      flush->batch_bo->exec_obj_index = last_idx;
   }

   /* Without LLC, CPU writes to the mapped batch are not coherent with the
    * GPU, so flush every cacheline of the mapping.  Fix: the previous code
    * passed `map` (without `+ i`) and so flushed only the first cacheline
    * over and over, leaving the rest of the batch potentially stale.
    */
   if (!device->info.has_llc) {
      __builtin_ia32_mfence();
      for (uint32_t i = 0; i < flush->batch_bo->size; i += CACHELINE_SIZE)
         __builtin_ia32_clflush((char *)flush->batch_bo->map + i);
   }

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = flush->batch.next - flush->batch.start,
      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags |
               (execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC),
      .rsvd1 = device->context_id,
      .rsvd2 = 0,
      .num_cliprects = execbuf->syncobj_count,
      .cliprects_ptr = (uintptr_t)execbuf->syncobjs,
   };

   return VK_SUCCESS;
}
static VkResult
anv_queue_exec_utrace_locked(struct anv_queue *queue,
struct anv_utrace_flush_copy *flush)
{
assert(flush->batch_bo);
struct anv_device *device = queue->device;
struct anv_execbuf execbuf;
anv_execbuf_init(&execbuf);
execbuf.alloc = &device->vk.alloc;
execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE;
VkResult result = setup_utrace_execbuf(&execbuf, queue, flush);
if (result != VK_SUCCESS)
goto error;
int ret = queue->device->info.no_hw ? 0 :
anv_gem_execbuffer(queue->device, &execbuf.execbuf);
if (ret)
result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
for (uint32_t k = 0; k < execbuf.bo_count; k++) {
if (anv_bo_is_pinned(execbuf.bos[k]))
assert(execbuf.bos[k]->offset == objects[k].offset);
execbuf.bos[k]->offset = objects[k].offset;
}
error:
anv_execbuf_finish(&execbuf);
return result;
}
/* We lock around execbuf for three main reasons:
*
* 1) When a block pool is resized, we create a new gem handle with a
@@ -1992,16 +2081,37 @@ anv_queue_exec_locked(struct anv_queue *queue,
uint32_t perf_query_pass)
{
struct anv_device *device = queue->device;
struct anv_utrace_flush_copy *utrace_flush_data = NULL;
struct anv_execbuf execbuf;
anv_execbuf_init(&execbuf);
execbuf.alloc = &queue->device->vk.alloc;
execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE;
execbuf.perf_query_pass = perf_query_pass;
/* Flush the trace points first, they need to be moved */
VkResult result =
anv_device_utrace_flush_cmd_buffers(queue,
cmd_buffer_count,
cmd_buffers,
&utrace_flush_data);
if (result != VK_SUCCESS)
goto error;
if (utrace_flush_data && !utrace_flush_data->batch_bo) {
result = anv_execbuf_add_sync(device, &execbuf,
utrace_flush_data->sync,
true /* is_signal */,
0);
if (result != VK_SUCCESS)
goto error;
utrace_flush_data = NULL;
}
/* Always add the workaround BO as it includes a driver identifier for the
* error_state.
*/
VkResult result =
result =
anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
if (result != VK_SUCCESS)
goto error;
@@ -2148,6 +2258,9 @@ anv_queue_exec_locked(struct anv_queue *queue,
error:
anv_execbuf_finish(&execbuf);
if (result == VK_SUCCESS && utrace_flush_data)
result = anv_queue_exec_utrace_locked(queue, utrace_flush_data);
return result;
}

View File

@@ -302,6 +302,8 @@ static VkResult anv_create_cmd_buffer(
anv_measure_init(cmd_buffer);
u_trace_init(&cmd_buffer->trace, &device->trace_context);
*pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
return VK_SUCCESS;
@@ -343,6 +345,8 @@ VkResult anv_AllocateCommandBuffers(
static void
anv_cmd_buffer_destroy(struct anv_cmd_buffer *cmd_buffer)
{
u_trace_fini(&cmd_buffer->trace);
anv_measure_destroy(cmd_buffer);
list_del(&cmd_buffer->pool_link);
@@ -401,6 +405,10 @@ anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer)
&cmd_buffer->device->general_state_pool, 16384);
anv_measure_reset(cmd_buffer);
u_trace_fini(&cmd_buffer->trace);
u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
return VK_SUCCESS;
}

View File

@@ -3297,6 +3297,8 @@ VkResult anv_CreateDevice(
anv_device_perf_init(device);
anv_device_utrace_init(device);
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
@@ -3364,6 +3366,8 @@ void anv_DestroyDevice(
if (!device)
return;
anv_device_utrace_finish(device);
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);

View File

@@ -143,8 +143,9 @@ void genX(blorp_exec)(struct blorp_batch *batch,
const struct blorp_params *params);
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_bo *bo,
uint32_t offset);
struct anv_device *device,
struct anv_address addr,
bool end_of_pipe);
void
genX(rasterization_mode)(VkPolygonMode raster_mode,

View File

@@ -158,7 +158,11 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
unsigned index = measure->base.index++;
(*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
(*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
true /* end_of_pipe */);
if (event_name == NULL)
event_name = intel_measure_snapshot_string(type);
@@ -195,7 +199,11 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
unsigned index = measure->base.index++;
assert(index % 2 == 1);
(*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
(*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
true /* end_of_pipe */);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));

View File

@@ -57,6 +57,7 @@
#include "util/macros.h"
#include "util/hash_table.h"
#include "util/list.h"
#include "util/perf/u_trace.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
#include "util/u_vector.h"
@@ -552,6 +553,46 @@ anv_bo_is_pinned(struct anv_bo *bo)
#endif
}
/* A GPU address expressed as a BO plus a byte offset.  When bo is NULL the
 * offset alone is treated as an absolute (canonical) 64-bit address.
 */
struct anv_address {
struct anv_bo *bo;
int64_t offset;
};
#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
/* Wrap an absolute 64-bit GPU address (no backing BO).  The address must
 * already be in canonical (sign-extended) form.
 */
static inline struct anv_address
anv_address_from_u64(uint64_t addr_u64)
{
assert(addr_u64 == intel_canonical_address(addr_u64));
return (struct anv_address) {
.bo = NULL,
.offset = addr_u64,
};
}
/* True only for the all-zero address (no BO and offset 0). */
static inline bool
anv_address_is_null(struct anv_address addr)
{
return addr.bo == NULL && addr.offset == 0;
}
/* Resolve to a canonical 64-bit GPU address.  Only valid for BO-relative
 * addresses when the BO is pinned (its offset is fixed); otherwise the
 * offset is assumed to already be absolute.
 */
static inline uint64_t
anv_address_physical(struct anv_address addr)
{
if (addr.bo && anv_bo_is_pinned(addr.bo)) {
return intel_canonical_address(addr.bo->offset + addr.offset);
} else {
return intel_canonical_address(addr.offset);
}
}
/* Return addr advanced by offset bytes (BO, if any, is unchanged). */
static inline struct anv_address
anv_address_add(struct anv_address addr, uint64_t offset)
{
addr.offset += offset;
return addr;
}
/* Represents a lock-free linked list of "free" things. This is used by
* both the block pool and the state pools. Unfortunately, in order to
* solve the ABA problem, we can't use a single uint32_t head.
@@ -986,7 +1027,7 @@ struct anv_physical_device {
int64_t master_minor;
struct drm_i915_query_engine_info * engine_info;
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_bo *, uint32_t );
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, bool);
struct intel_measure_device measure_device;
};
@@ -1094,11 +1135,6 @@ anv_device_upload_nir(struct anv_device *device,
const struct nir_shader *nir,
unsigned char sha1_key[20]);
struct anv_address {
struct anv_bo *bo;
int64_t offset;
};
struct anv_device {
struct vk_device vk;
@@ -1179,6 +1215,8 @@ struct anv_device {
const struct intel_l3_config *l3_config;
struct intel_debug_block_frame *debug_frame_desc;
struct u_trace_context trace_context;
};
#if defined(GFX_VERx10) && GFX_VERx10 >= 90
@@ -1506,42 +1544,6 @@ anv_batch_emit_reloc(struct anv_batch *batch,
return address_u64;
}
#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
static inline struct anv_address
anv_address_from_u64(uint64_t addr_u64)
{
assert(addr_u64 == intel_canonical_address(addr_u64));
return (struct anv_address) {
.bo = NULL,
.offset = addr_u64,
};
}
static inline bool
anv_address_is_null(struct anv_address addr)
{
return addr.bo == NULL && addr.offset == 0;
}
static inline uint64_t
anv_address_physical(struct anv_address addr)
{
if (addr.bo && anv_bo_is_pinned(addr.bo)) {
return intel_canonical_address(addr.bo->offset + addr.offset);
} else {
return intel_canonical_address(addr.offset);
}
}
static inline struct anv_address
anv_address_add(struct anv_address addr, uint64_t offset)
{
addr.offset += offset;
return addr;
}
static inline void
write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
{
@@ -3088,6 +3090,11 @@ struct anv_cmd_buffer {
* Used to increase allocation size for long command buffers.
*/
uint32_t total_batch_size;
/**
*
*/
struct u_trace trace;
};
/* Determine whether we can chain a given cmd_buffer to another one. We need
@@ -4541,6 +4548,29 @@ struct anv_memcpy_state {
struct anv_vb_cache_range vb_dirty;
};
struct anv_utrace_flush_copy {
struct u_trace trace;
struct anv_reloc_list relocs;
struct anv_batch batch;
struct anv_bo *batch_bo;
struct anv_bo *trace_bo;
struct vk_sync *sync;
struct anv_memcpy_state memcpy_state;
};
void anv_device_utrace_init(struct anv_device *device);
void anv_device_utrace_finish(struct anv_device *device);
VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
uint32_t cmd_buffer_count,
struct anv_cmd_buffer **cmd_buffers,
struct anv_utrace_flush_copy **out_flush_data);
#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \
VK_FROM_HANDLE(__anv_type, __name, __handle)

View File

@@ -0,0 +1,163 @@
#
# Copyright © 2021 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import argparse
import sys
#
# Tracepoint definitions:
#
def define_tracepoints(args):
    """Register every anv tracepoint with the u_trace generator.

    Must run after ``args.import_path`` has been inserted into ``sys.path``
    so that the ``u_trace`` module (living in the Mesa tree) is importable.
    """
    from u_trace import Header, HeaderScope
    from u_trace import ForwardDecl
    from u_trace import Tracepoint
    from u_trace import TracepointArg as Arg
    from u_trace import TracepointArgStruct as ArgStruct

    Header('anv_private.h', scope=HeaderScope.SOURCE)
    Header('blorp/blorp_priv.h', scope=HeaderScope.HEADER)

    def begin_end_tp(name, tp_args=None, tp_struct=None, end_pipelined=True):
        """Declare a begin/end tracepoint pair; payload is recorded on 'end'.

        Fix: avoid a mutable default argument for tp_args.
        """
        Tracepoint('begin_{0}'.format(name))
        Tracepoint('end_{0}'.format(name),
                   args=tp_args if tp_args is not None else [],
                   tp_struct=tp_struct,
                   end_of_pipe=end_pipelined)

    begin_end_tp('cmd_buffer',
                 tp_args=[ArgStruct(type='uint8_t', var='level'),],
                 tp_struct=[Arg(type='uint8_t', name='level', var='level', c_format='%hhu'),],
                 end_pipelined=False)

    # Fix: subpass_count used c_format='%ou' (octal conversion followed by a
    # literal 'u'); '%u' is the correct unsigned-decimal specifier.
    begin_end_tp('render_pass',
                 tp_args=[ArgStruct(type='uint16_t', var='width'),
                          ArgStruct(type='uint16_t', var='height'),
                          ArgStruct(type='uint8_t', var='att_count'),
                          ArgStruct(type='uint8_t', var='msaa'),
                          ArgStruct(type='uint32_t', var='subpass_count'),],
                 tp_struct=[Arg(type='uint16_t', name='width', var='width', c_format='%hu'),
                            Arg(type='uint16_t', name='height', var='height', c_format='%hu'),
                            Arg(type='uint8_t', name='att_count', var='att_count', c_format='%hhu'),
                            Arg(type='uint8_t', name='msaa', var='msaa', c_format='%hhu'),
                            Arg(type='uint32_t', name='subpass_count', var='subpass_count', c_format='%u'),])

    begin_end_tp('blorp',
                 tp_args=[ArgStruct(type='uint32_t', var='width'),
                          ArgStruct(type='uint32_t', var='height'),
                          ArgStruct(type='enum isl_aux_op', var='hiz_op'),
                          ArgStruct(type='enum isl_aux_op', var='fast_clear_op'),
                          ArgStruct(type='enum blorp_shader_type', var='shader_type'),
                          ArgStruct(type='enum blorp_shader_pipeline', var='shader_pipe'),],
                 tp_struct=[Arg(type='uint32_t', name='width', var='width', c_format='%u'),
                            Arg(type='uint32_t', name='height', var='height', c_format='%u'),
                            Arg(type='enum isl_aux_op', name='hiz_op', var='hiz_op', c_format='%s', to_prim_type='isl_aux_op_to_name({})'),
                            Arg(type='enum isl_aux_op', name='fast_clear_op', var='fast_clear_op', c_format='%s', to_prim_type='isl_aux_op_to_name({})'),
                            Arg(type='enum blorp_shader_type', name='type', var='shader_type', c_format='%s', to_prim_type='blorp_shader_type_to_name({})'),
                            Arg(type='enum blorp_shader_pipeline', name='pipe', var='shader_pipe', c_format='%s', to_prim_type='blorp_shader_pipeline_to_name({})'),])

    begin_end_tp('draw',
                 tp_args=[ArgStruct(type='uint32_t', var='count'),],
                 tp_struct=[Arg(type='uint32_t', name='count', var='count', c_format='%u'),])
    begin_end_tp('draw_multi',
                 tp_args=[ArgStruct(type='uint32_t', var='count'),],
                 tp_struct=[Arg(type='uint32_t', name='count', var='count', c_format='%u'),])
    begin_end_tp('draw_indexed',
                 tp_args=[ArgStruct(type='uint32_t', var='count'),],
                 tp_struct=[Arg(type='uint32_t', name='count', var='count', c_format='%u'),])
    begin_end_tp('draw_indexed_multi',
                 tp_args=[ArgStruct(type='uint32_t', var='count'),],
                 tp_struct=[Arg(type='uint32_t', name='count', var='count', c_format='%u'),])
    begin_end_tp('draw_indirect_byte_count',
                 tp_args=[ArgStruct(type='uint32_t', var='instance_count'),],
                 tp_struct=[Arg(type='uint32_t', name='instance_count', var='instance_count', c_format='%u'),])
    begin_end_tp('draw_indirect',
                 tp_args=[ArgStruct(type='uint32_t', var='draw_count'),],
                 tp_struct=[Arg(type='uint32_t', name='draw_count', var='draw_count', c_format='%u'),])
    begin_end_tp('draw_indexed_indirect',
                 tp_args=[ArgStruct(type='uint32_t', var='draw_count'),],
                 tp_struct=[Arg(type='uint32_t', name='draw_count', var='draw_count', c_format='%u'),])
    begin_end_tp('draw_indirect_count',
                 tp_args=[ArgStruct(type='uint32_t', var='max_draw_count'),],
                 tp_struct=[Arg(type='uint32_t', name='max_draw_count', var='max_draw_count', c_format='%u'),])
    begin_end_tp('draw_indexed_indirect_count',
                 tp_args=[ArgStruct(type='uint32_t', var='max_draw_count'),],
                 tp_struct=[Arg(type='uint32_t', name='max_draw_count', var='max_draw_count', c_format='%u'),])

    begin_end_tp('compute',
                 tp_args=[ArgStruct(type='uint32_t', var='group_x'),
                          ArgStruct(type='uint32_t', var='group_y'),
                          ArgStruct(type='uint32_t', var='group_z'),],
                 tp_struct=[Arg(type='uint32_t', name='group_x', var='group_x', c_format='%u'),
                            Arg(type='uint32_t', name='group_y', var='group_y', c_format='%u'),
                            Arg(type='uint32_t', name='group_z', var='group_z', c_format='%u'),])

    def stall_args(args):
        """Build the tp_print list: a format string followed by one
        conditional "+name" expression per pipe-control flag."""
        fmt = ''
        exprs = []
        for a in args:
            fmt += '%s'
            exprs.append('(__entry->flags & ANV_PIPE_{0}_BIT) ? "+{1}" : ""'.format(a[0], a[1]))
        fmt = [fmt]
        fmt += exprs
        return fmt

    Tracepoint('stall',
               args=[ArgStruct(type='uint32_t', var='flags'),],
               tp_struct=[Arg(type='uint32_t', name='flags', var='flags', c_format='0x%x'),],
               tp_print=stall_args([['DEPTH_CACHE_FLUSH', 'depth_flush'],
                                    ['DATA_CACHE_FLUSH', 'dc_flush'],
                                    ['HDC_PIPELINE_FLUSH', 'hdc_flush'],
                                    ['RENDER_TARGET_CACHE_FLUSH', 'rt_flush'],
                                    ['TILE_CACHE_FLUSH', 'tile_flush'],
                                    ['STATE_CACHE_INVALIDATE', 'state_inval'],
                                    ['CONSTANT_CACHE_INVALIDATE', 'const_inval'],
                                    ['VF_CACHE_INVALIDATE', 'vf_inval'],
                                    ['TEXTURE_CACHE_INVALIDATE', 'tex_inval'],
                                    ['INSTRUCTION_CACHE_INVALIDATE', 'ic_inval'],
                                    ['STALL_AT_SCOREBOARD', 'pb_stall'],
                                    ['DEPTH_STALL', 'depth_stall'],
                                    ['CS_STALL', 'cs_stall'],
                                    ]))
def generate_code(args):
    """Emit the anv_tracepoints C source and header via the u_trace generator."""
    from u_trace import utrace_generate
    utrace_generate(cpath=args.utrace_src,
                    hpath=args.utrace_hdr,
                    ctx_param='struct anv_device *dev')
def main():
    """Parse command-line options, then define and generate the tracepoints."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--import-path', required=True)
    parser.add_argument('--utrace-src', required=True)
    parser.add_argument('--utrace-hdr', required=True)
    options = parser.parse_args()

    # The u_trace module lives outside the default module path.
    sys.path.insert(0, options.import_path)

    define_tracepoints(options)
    generate_code(options)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,279 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "anv_private.h"
#include "perf/intel_perf.h"
/* Count how many of the given command buffers carry u_trace points.
 *
 * Also accumulates into *utrace_copies the number of trace chunks that must
 * be copied out, i.e. chunks from resubmittable command buffers (those NOT
 * recorded with ONE_TIME_SUBMIT).  Returns 0 immediately when no tracing
 * session is active.
 */
static uint32_t
command_buffers_count_utraces(struct anv_device *device,
                              uint32_t cmd_buffer_count,
                              struct anv_cmd_buffer **cmd_buffers,
                              uint32_t *utrace_copies)
{
   if (!u_trace_context_actively_tracing(&device->trace_context))
      return 0;

   uint32_t n_utraces = 0;
   for (uint32_t c = 0; c < cmd_buffer_count; c++) {
      struct anv_cmd_buffer *cmd_buffer = cmd_buffers[c];

      if (!u_trace_has_points(&cmd_buffer->trace))
         continue;

      n_utraces++;

      /* Reusable command buffers need their timestamps copied out before
       * a potential resubmission overwrites them.
       */
      const bool one_time =
         cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
      if (!one_time)
         *utrace_copies += list_length(&cmd_buffer->trace.trace_chunks);
   }

   return n_utraces;
}
/* u_trace callback: destroy an anv_utrace_flush_copy once the trace context
 * is done with it.  Releases, in order, the flush's own u_trace, the copy
 * batch/trace BOs and reloc list (only present when copies were emitted --
 * trace_bo and batch_bo are allocated together), the signalling sync, and
 * finally the flush allocation itself.
 */
static void
anv_utrace_delete_flush_data(struct u_trace_context *utctx,
void *flush_data)
{
struct anv_device *device =
container_of(utctx, struct anv_device, trace_context);
struct anv_utrace_flush_copy *flush = flush_data;
u_trace_fini(&flush->trace);
/* trace_bo is only non-NULL when the copy path allocated both BOs and the
 * reloc list (see anv_device_utrace_flush_cmd_buffers).
 */
if (flush->trace_bo) {
assert(flush->batch_bo);
anv_reloc_list_finish(&flush->relocs, &device->vk.alloc);
anv_device_release_bo(device, flush->batch_bo);
anv_device_release_bo(device, flush->trace_bo);
}
vk_sync_destroy(&device->vk, flush->sync);
vk_free(&device->vk.alloc, flush);
}
/* u_trace callback: emit GPU commands (streamout memcpy) that copy @count
 * 64-bit timestamps from one timestamp BO to another.  @cmdstream is the
 * anv_utrace_flush_copy whose batch is being built; offsets are in
 * timestamp slots, not bytes.
 */
static void
anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx,
                                      void *cmdstream,
                                      void *ts_from, uint32_t from_offset,
                                      void *ts_to, uint32_t to_offset,
                                      uint32_t count)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, trace_context);
   struct anv_utrace_flush_copy *flush = cmdstream;

   /* Convert slot indices into byte addresses within each BO. */
   const uint32_t ts_size = sizeof(uint64_t);
   struct anv_address src_addr = {
      .bo = ts_from,
      .offset = from_offset * ts_size,
   };
   struct anv_address dst_addr = {
      .bo = ts_to,
      .offset = to_offset * ts_size,
   };

   anv_genX(&device->info, emit_so_memcpy)(&flush->memcpy_state,
                                           dst_addr, src_addr,
                                           count * ts_size);
}
/* Prepare the u_trace flush data for a batch of command buffers about to be
 * submitted on @queue.
 *
 * If no command buffer has trace points (or tracing is inactive),
 * *out_flush_data is set to NULL and VK_SUCCESS returned.  Otherwise a
 * flush-copy object is allocated holding a sync to be signaled by the
 * submission and, when any resubmittable (non ONE_TIME_SUBMIT) command
 * buffer is involved, a batch that copies its timestamps into a private BO
 * so resubmission cannot overwrite them.  Ownership of the returned flush
 * data passes to u_trace (freed via anv_utrace_delete_flush_data).
 */
VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
uint32_t cmd_buffer_count,
struct anv_cmd_buffer **cmd_buffers,
struct anv_utrace_flush_copy **out_flush_data)
{
struct anv_device *device = queue->device;
uint32_t utrace_copies = 0;
uint32_t utraces = command_buffers_count_utraces(device,
cmd_buffer_count,
cmd_buffers,
&utrace_copies);
if (!utraces) {
*out_flush_data = NULL;
return VK_SUCCESS;
}
VkResult result;
struct anv_utrace_flush_copy *flush =
vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_flush_copy),
8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!flush)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
u_trace_init(&flush->trace, &device->trace_context);
/* Signaled by the submission so anv_utrace_read_ts knows when results are
 * ready.
 */
result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
0, 0, &flush->sync);
if (result != VK_SUCCESS)
goto error_sync;
if (utrace_copies > 0) {
/* Destination BO for the copied timestamps: one page per trace chunk. */
result =
anv_device_alloc_bo(device, "utrace-copy-buf", utrace_copies * 4096,
ANV_BO_ALLOC_MAPPED, 0 /* explicit_address */,
&flush->trace_bo);
if (result != VK_SUCCESS)
goto error_trace_buf;
result =
anv_device_alloc_bo(device, "utrace-copy-batch",
/* 128 dwords of setup + 64 dwords per copy */
align_u32(512 + 64 * utrace_copies, 4096),
ANV_BO_ALLOC_MAPPED, 0 /* explicit_address */,
&flush->batch_bo);
if (result != VK_SUCCESS)
goto error_batch_buf;
result = anv_reloc_list_init(&flush->relocs, &device->vk.alloc);
if (result != VK_SUCCESS)
goto error_reloc_list;
flush->batch.alloc = &device->vk.alloc;
flush->batch.relocs = &flush->relocs;
anv_batch_set_storage(&flush->batch,
(struct anv_address) { .bo = flush->batch_bo, },
flush->batch_bo->map, flush->batch_bo->size);
/* Emit the copies */
anv_genX(&device->info, emit_so_memcpy_init)(&flush->memcpy_state,
device,
&flush->batch);
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
/* ONE_TIME_SUBMIT traces can be flushed in place; reusable ones are
 * cloned into flush->trace with GPU copies of their timestamps.
 */
if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
u_trace_flush(&cmd_buffers[i]->trace, flush, false);
} else {
u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
u_trace_end_iterator(&cmd_buffers[i]->trace),
&flush->trace,
flush,
anv_device_utrace_emit_copy_ts_buffer);
}
}
anv_genX(&device->info, emit_so_memcpy_fini)(&flush->memcpy_state);
u_trace_flush(&flush->trace, flush, true);
if (flush->batch.status != VK_SUCCESS) {
result = flush->batch.status;
goto error_batch;
}
} else {
/* No copies needed: every traced command buffer is one-time-submit. */
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
u_trace_flush(&cmd_buffers[i]->trace, flush, i == (cmd_buffer_count - 1));
}
}
*out_flush_data = flush;
return VK_SUCCESS;
/* Unwind in reverse acquisition order. */
error_batch:
anv_reloc_list_finish(&flush->relocs, &device->vk.alloc);
error_reloc_list:
anv_device_release_bo(device, flush->batch_bo);
error_batch_buf:
anv_device_release_bo(device, flush->trace_bo);
error_trace_buf:
vk_sync_destroy(&device->vk, flush->sync);
error_sync:
vk_free(&device->vk.alloc, flush);
return result;
}
/* u_trace callback: allocate a CPU-mapped BO to hold @size_b bytes of GPU
 * timestamps (rounded up to a page).
 *
 * NOTE(review): allocation failure is only caught by the assert, which
 * compiles out in release builds; bo would then be returned as NULL --
 * presumably acceptable to u_trace, but worth confirming.
 */
static void *
anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
{
struct anv_device *device =
container_of(utctx, struct anv_device, trace_context);
struct anv_bo *bo = NULL;
UNUSED VkResult result =
anv_device_alloc_bo(device, "utrace-ts", align_u32(size_b, 4096),
ANV_BO_ALLOC_MAPPED, 0, &bo);
assert(result == VK_SUCCESS);
return bo;
}
/* u_trace callback: release a timestamp BO previously handed out by
 * anv_utrace_create_ts_buffer().
 */
static void
anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, trace_context);

   anv_device_release_bo(device, (struct anv_bo *)timestamps);
}
/* u_trace callback: emit a GPU timestamp write into slot @idx of the
 * timestamp BO.  @cs is the recording anv_cmd_buffer; @end_of_pipe selects
 * a post-pipeline timestamp (see cmd_emit_timestamp).
 */
static void
anv_utrace_record_ts(struct u_trace *ut, void *cs, void *timestamps, unsigned idx,
bool end_of_pipe)
{
struct anv_cmd_buffer *cmd_buffer = cs;
struct anv_device *device = cmd_buffer->device;
struct anv_bo *bo = timestamps;
/* Each slot is one 64-bit timestamp. */
device->physical->cmd_emit_timestamp(&cmd_buffer->batch, device,
(struct anv_address) {
.bo = bo,
.offset = idx * sizeof(uint64_t) },
end_of_pipe);
}
/* u_trace callback: read back timestamp @idx from a timestamp BO.
 *
 * Waits for the flush's sync on the first read only (a completed submission
 * implies all of its timestamps have landed), then scales the raw GPU tick
 * value into the common time domain via intel_device_info_timebase_scale.
 */
static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
void *timestamps, unsigned idx, void *flush_data)
{
struct anv_device *device =
container_of(utctx, struct anv_device, trace_context);
struct anv_bo *bo = timestamps;
struct anv_utrace_flush_copy *flush = flush_data;
/* Only need to stall on results for the first entry: */
if (idx == 0) {
UNUSED VkResult result =
vk_sync_wait(&device->vk,
flush->sync,
0,
VK_SYNC_WAIT_COMPLETE,
os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
assert(result == VK_SUCCESS);
}
uint64_t *ts = bo->map;
/* Don't translate the no-timestamp marker: */
if (ts[idx] == U_TRACE_NO_TIMESTAMP)
return U_TRACE_NO_TIMESTAMP;
return intel_device_info_timebase_scale(&device->info, ts[idx]);
}
/* Initialize the device's u_trace context, wiring up the anv callbacks for
 * timestamp buffer management, recording, readback, and flush-data cleanup.
 */
void
anv_device_utrace_init(struct anv_device *device)
{
u_trace_context_init(&device->trace_context, device,
anv_utrace_create_ts_buffer,
anv_utrace_destroy_ts_buffer,
anv_utrace_record_ts,
anv_utrace_read_ts,
anv_utrace_delete_flush_data);
}
/* Tear down the device's u_trace context (counterpart of
 * anv_device_utrace_init).
 */
void
anv_device_utrace_finish(struct anv_device *device)
{
u_trace_context_fini(&device->trace_context);
}

View File

@@ -107,5 +107,7 @@ VkResult anv_QueuePresentKHR(
vk_semaphore_reset_temporary(&queue->device->vk, semaphore);
}
u_trace_context_process(&queue->device->trace_context, true);
return result;
}

View File

@@ -34,10 +34,13 @@
#include "common/intel_l3_config.h"
#include "blorp/blorp_genX_exec.h"
#include "anv_tracepoints.h"
static void blorp_measure_start(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
trace_begin_blorp(&cmd_buffer->trace, cmd_buffer);
anv_measure_snapshot(cmd_buffer,
params->snapshot_type,
NULL, 0);
@@ -46,6 +49,14 @@ static void blorp_measure_start(struct blorp_batch *_batch,
/* Record the end-of-blorp tracepoint with the operation's dimensions
 * (x1/x0, y1/y0 are the destination rect bounds) and the aux/shader
 * parameters describing what blorp executed.
 */
static void blorp_measure_end(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
trace_end_blorp(&cmd_buffer->trace, cmd_buffer,
params->x1 - params->x0,
params->y1 - params->y0,
params->hiz_op,
params->fast_clear_op,
params->shader_type,
params->shader_pipeline);
}
static void *

View File

@@ -38,6 +38,8 @@
#include "nir/nir_xfb_info.h"
#include "anv_tracepoints.h"
/* We reserve :
* - GPR 14 for secondary command buffer returns
* - GPR 15 for conditional rendering
@@ -1761,6 +1763,8 @@ genX(BeginCommandBuffer)(
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
trace_begin_cmd_buffer(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
/* We sometimes store vertex data in the dynamic state buffer for blorp
@@ -1934,6 +1938,8 @@ genX(EndCommandBuffer)(
emit_isp_disable(cmd_buffer);
trace_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer, cmd_buffer->level);
anv_cmd_buffer_end_batch_buffer(cmd_buffer);
return VK_SUCCESS;
@@ -2399,6 +2405,9 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
else if (bits == 0)
return;
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS))
trace_stall(&cmd_buffer->trace, cmd_buffer, bits);
if ((GFX_VER >= 8 && GFX_VER <= 9) &&
(bits & ANV_PIPE_CS_STALL_BIT) &&
(bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
@@ -3954,6 +3963,7 @@ void genX(CmdDraw)(
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw", count);
trace_begin_draw(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
@@ -3982,6 +3992,8 @@ void genX(CmdDraw)(
}
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
trace_end_draw(&cmd_buffer->trace, cmd_buffer, count);
}
void genX(CmdDrawMultiEXT)(
@@ -4006,6 +4018,7 @@ void genX(CmdDrawMultiEXT)(
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_DRAW,
"draw_multi", count);
trace_begin_draw_multi(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
@@ -4037,6 +4050,8 @@ void genX(CmdDrawMultiEXT)(
}
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
trace_end_draw_multi(&cmd_buffer->trace, cmd_buffer, count);
}
void genX(CmdDrawIndexed)(
@@ -4062,6 +4077,7 @@ void genX(CmdDrawIndexed)(
INTEL_SNAPSHOT_DRAW,
"draw indexed",
count);
trace_begin_draw_indexed(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
@@ -4088,6 +4104,8 @@ void genX(CmdDrawIndexed)(
}
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
trace_end_draw_indexed(&cmd_buffer->trace, cmd_buffer, count);
}
void genX(CmdDrawMultiIndexedEXT)(
@@ -4114,6 +4132,7 @@ void genX(CmdDrawMultiIndexedEXT)(
INTEL_SNAPSHOT_DRAW,
"draw indexed_multi",
count);
trace_begin_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
@@ -4200,6 +4219,8 @@ void genX(CmdDrawMultiIndexedEXT)(
}
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
trace_end_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer, count);
}
/* Auto-Draw / Indirect Registers */
@@ -4235,6 +4256,7 @@ void genX(CmdDrawIndirectByteCountEXT)(
INTEL_SNAPSHOT_DRAW,
"draw indirect byte count",
instanceCount);
trace_begin_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
@@ -4277,6 +4299,9 @@ void genX(CmdDrawIndirectByteCountEXT)(
}
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
trace_end_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer,
instanceCount);
#endif /* GFX_VERx10 >= 75 */
}
@@ -4333,6 +4358,8 @@ void genX(CmdDrawIndirect)(
if (anv_batch_has_error(&cmd_buffer->batch))
return;
trace_begin_draw_indirect(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
@@ -4365,6 +4392,8 @@ void genX(CmdDrawIndirect)(
offset += stride;
}
trace_end_draw_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
}
void genX(CmdDrawIndexedIndirect)(
@@ -4382,6 +4411,8 @@ void genX(CmdDrawIndexedIndirect)(
if (anv_batch_has_error(&cmd_buffer->batch))
return;
trace_begin_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
@@ -4415,6 +4446,8 @@ void genX(CmdDrawIndexedIndirect)(
offset += stride;
}
trace_end_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
}
static struct mi_value
@@ -4541,6 +4574,8 @@ void genX(CmdDrawIndirectCount)(
if (anv_batch_has_error(&cmd_buffer->batch))
return;
trace_begin_draw_indirect_count(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
struct mi_builder b;
@@ -4580,6 +4615,8 @@ void genX(CmdDrawIndirectCount)(
}
mi_value_unref(&b, max);
trace_end_draw_indirect_count(&cmd_buffer->trace, cmd_buffer, maxDrawCount);
}
void genX(CmdDrawIndexedIndirectCount)(
@@ -4601,6 +4638,8 @@ void genX(CmdDrawIndexedIndirectCount)(
if (anv_batch_has_error(&cmd_buffer->batch))
return;
trace_begin_draw_indexed_indirect_count(&cmd_buffer->trace, cmd_buffer);
genX(cmd_buffer_flush_state)(cmd_buffer);
struct mi_builder b;
@@ -4641,6 +4680,9 @@ void genX(CmdDrawIndexedIndirectCount)(
}
mi_value_unref(&b, max);
trace_end_draw_indexed_indirect_count(&cmd_buffer->trace, cmd_buffer, maxDrawCount);
}
void genX(CmdBeginTransformFeedbackEXT)(
@@ -5016,6 +5058,8 @@ void genX(CmdDispatchBase)(
prog_data->local_size[0] * prog_data->local_size[1] *
prog_data->local_size[2]);
trace_begin_compute(&cmd_buffer->trace, cmd_buffer);
if (prog_data->uses_num_work_groups) {
struct anv_state state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
@@ -5039,6 +5083,9 @@ void genX(CmdDispatchBase)(
emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
groupCountY, groupCountZ);
trace_end_compute(&cmd_buffer->trace, cmd_buffer,
groupCountX, groupCountY, groupCountZ);
}
#define GPGPU_DISPATCHDIMX 0x2500
@@ -5072,6 +5119,7 @@ void genX(CmdDispatchIndirect)(
INTEL_SNAPSHOT_COMPUTE,
"compute indirect",
0);
trace_begin_compute(&cmd_buffer->trace, cmd_buffer);
if (prog_data->uses_num_work_groups) {
cmd_buffer->state.compute.num_workgroups = addr;
@@ -5145,6 +5193,8 @@ void genX(CmdDispatchIndirect)(
#endif
emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
trace_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0);
}
#if GFX_VERx10 >= 125
@@ -6750,6 +6800,7 @@ void genX(CmdBeginRenderPass2)(
cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
anv_measure_beginrenderpass(cmd_buffer);
trace_begin_render_pass(&cmd_buffer->trace, cmd_buffer);
result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
framebuffer,
@@ -6792,6 +6843,14 @@ void genX(CmdEndRenderPass2)(
cmd_buffer_end_subpass(cmd_buffer);
trace_end_render_pass(&cmd_buffer->trace, cmd_buffer,
cmd_buffer->state.render_area.extent.width,
cmd_buffer->state.render_area.extent.height,
cmd_buffer->state.pass->attachment_count,
cmd_buffer->state.pass->attachment_count > 0 ?
cmd_buffer->state.pass->attachments[0].samples : 0,
cmd_buffer->state.pass->subpass_count);
cmd_buffer->state.hiz_enabled = false;
/* Remove references to render pass specific state. This enables us to
@@ -7030,13 +7089,21 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
return VK_SUCCESS;
}
#define TIMESTAMP 0x2358
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_bo *bo,
uint32_t offset) {
struct anv_device *device,
struct anv_address addr,
bool end_of_pipe) {
if (end_of_pipe) {
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
pc.CommandStreamerStallEnable = true;
pc.PostSyncOperation = WriteTimestamp;
pc.Address = (struct anv_address) {bo, offset};
pc.Address = addr;
anv_debug_dump_pc(pc);
}
} else {
struct mi_builder b;
mi_builder_init(&b, &device->info, batch);
mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
}
}

View File

@@ -33,6 +33,19 @@ anv_entrypoints = custom_target(
depend_files : vk_entrypoints_gen_depend_files,
)
anv_tracepoints = custom_target(
'anv_tracepoints.[ch]',
input: 'anv_tracepoints.py',
output: ['anv_tracepoints.h', 'anv_tracepoints.c'],
command: [
prog_python, '@INPUT@',
'-p', join_paths(meson.source_root(), 'src/util/perf/'),
'--utrace-hdr', '@OUTPUT0@',
'--utrace-src', '@OUTPUT1@',
],
depend_files: u_trace_py,
)
intel_icd = custom_target(
'intel_icd',
input : [vk_icd_gen, vk_api_xml],
@@ -65,7 +78,7 @@ foreach g : [['70', ['gfx7_cmd_buffer.c']], ['75', ['gfx7_cmd_buffer.c']],
_gfx_ver = g[0]
libanv_per_hw_ver_libs += static_library(
'anv_per_hw_ver@0@'.format(_gfx_ver),
[anv_per_hw_ver_files, g[1], anv_entrypoints[0]],
[anv_per_hw_ver_files, g[1], anv_entrypoints[0], anv_tracepoints[0]],
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel,
],
@@ -111,6 +124,7 @@ libanv_files = files(
'anv_private.h',
'anv_queue.c',
'anv_util.c',
'anv_utrace.c',
'anv_wsi.c',
)
@@ -154,7 +168,7 @@ libanv_common = static_library(
'anv_common',
[
libanv_files, anv_entrypoints, sha1_h,
gen_xml_pack,
gen_xml_pack
],
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
@@ -167,7 +181,7 @@ libanv_common = static_library(
libvulkan_intel = shared_library(
'vulkan_intel',
[files('anv_gem.c'), anv_entrypoints[0]],
[files('anv_gem.c'), anv_entrypoints[0], anv_tracepoints],
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
],
@@ -202,7 +216,7 @@ endif
if with_tests
libvulkan_intel_test = static_library(
'vulkan_intel_test',
[files('anv_gem_stubs.c'), anv_entrypoints[0]],
[files('anv_gem_stubs.c'), anv_entrypoints[0], anv_tracepoints[0]],
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
],
@@ -227,7 +241,7 @@ if with_tests
'anv_@0@'.format(t),
executable(
t,
['tests/@0@.c'.format(t), anv_entrypoints[0]],
['tests/@0@.c'.format(t), anv_entrypoints[0], anv_tracepoints[0]],
c_args : [ c_sse2_args ],
link_with : libvulkan_intel_test,
dependencies : [