iris: Wait for drm_xe_exec_queue to be idle before destroying it

Xe KMD does not refcount anything, so resources could be freed while they
are still in use if we don't wait for the exec_queue to be idle.

This issue was found with Xe KMD error capture: the VM was already
destroyed when it attempted to capture the error state. It can also
happen in applications that did not hang.

This fixes the '*ERROR* GT0: TLB invalidation' errors seen when running
the piglit "all" test list.

Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27500>
This commit is contained in:
José Roberto de Souza
2024-02-06 10:27:54 -08:00
committed by Marge Bot
parent 138303fb9d
commit 665d30b544
3 changed files with 55 additions and 5 deletions

View File

@@ -862,8 +862,8 @@ iris_batch_name_to_string(enum iris_batch_name name)
return names[name];
}
static inline bool
context_or_exec_queue_was_banned(struct iris_bufmgr *bufmgr, int ret)
bool
iris_batch_is_banned(struct iris_bufmgr *bufmgr, int ret)
{
enum intel_kmd_type kmd_type = iris_bufmgr_get_device_info(bufmgr)->kmd_type;
@@ -960,7 +960,7 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
* has been lost and needs to be re-initialized. If this succeeds,
* dubiously claim success...
*/
if (ret && context_or_exec_queue_was_banned(bufmgr, ret)) {
if (ret && iris_batch_is_banned(bufmgr, ret)) {
enum pipe_reset_status status = iris_batch_check_for_reset(batch);
if (status != PIPE_NO_RESET || ice->context_reset_signaled)

View File

@@ -446,6 +446,9 @@ iris_batch_mark_reset_sync(struct iris_batch *batch)
const char *
iris_batch_name_to_string(enum iris_batch_name name);
bool
iris_batch_is_banned(struct iris_bufmgr *bufmgr, int ret);
#define iris_foreach_batch(ice, batch) \
for (struct iris_batch *batch = &ice->batches[0]; \
batch <= &ice->batches[((struct iris_screen *)ice->ctx.screen)->devinfo->ver >= 12 ? IRIS_BATCH_BLITTER : IRIS_BATCH_COMPUTE]; \

View File

@@ -151,7 +151,45 @@ void iris_xe_init_batches(struct iris_context *ice)
free(engines_info);
}
void iris_xe_destroy_batch(struct iris_batch *batch)
/*
 * Wait for all previous DRM_IOCTL_XE_EXEC calls over the
 * drm_xe_exec_queue in this iris_batch to complete.
 *
 * Best effort: if the temporary syncobj cannot be created the function
 * returns without waiting.
 */
static void
iris_xe_wait_exec_queue_idle(struct iris_batch *batch)
{
   struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
   struct iris_syncobj *syncobj = iris_create_syncobj(bufmgr);
   struct drm_xe_sync xe_sync = {
      .type = DRM_XE_SYNC_TYPE_SYNCOBJ,
      .flags = DRM_XE_SYNC_FLAG_SIGNAL,
   };
   struct drm_xe_exec exec = {
      .exec_queue_id = batch->xe.exec_queue_id,
      .num_syncs = 1,
      .syncs = (uintptr_t)&xe_sync,
   };
   int ret;

   if (!syncobj)
      return;

   xe_sync.handle = syncobj->handle;

   /* Using the special exec.num_batch_buffer == 0 handling to get syncobj
    * signaled when the last DRM_IOCTL_XE_EXEC is completed.
    */
   ret = intel_ioctl(iris_bufmgr_get_fd(bufmgr), DRM_IOCTL_XE_EXEC, &exec);
   if (ret == 0) {
      /* The wait must be performed outside of assert(): assert() expands
       * to nothing when NDEBUG is defined, which would silently drop the
       * wait in release builds and defeat the purpose of this function.
       */
      bool wait_result = iris_wait_syncobj(bufmgr, syncobj, INT64_MAX);

      assert(wait_result);
      (void)wait_result; /* only read by assert(); avoid unused warning */
   } else {
      /* Debug-only sanity check: the exec is only expected to fail when
       * the exec_queue was banned.
       */
      assert(iris_batch_is_banned(bufmgr, errno) == true);
   }

   iris_syncobj_destroy(bufmgr, syncobj);
}
static void
iris_xe_destroy_exec_queue(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
@@ -165,6 +203,15 @@ void iris_xe_destroy_batch(struct iris_batch *batch)
assert(ret == 0);
}
/*
 * Destroy the exec_queue backing this batch, after first waiting for the
 * exec_queue to become idle.
 */
void
iris_xe_destroy_batch(struct iris_batch *batch)
{
   /* Xe KMD does not refcount anything: without waiting for the exec_queue
    * to be idle, resources could be freed while they are still in use.
    */
   iris_xe_wait_exec_queue_idle(batch);

   iris_xe_destroy_exec_queue(batch);
}
bool iris_xe_replace_batch(struct iris_batch *batch)
{
enum intel_engine_class engine_classes[IRIS_BATCH_COUNT];
@@ -184,7 +231,7 @@ bool iris_xe_replace_batch(struct iris_batch *batch)
ret = iris_xe_init_batch(bufmgr, engines_info, engine_classes[batch->name],
ice->priority, &new_exec_queue_id);
if (ret) {
iris_xe_destroy_batch(batch);
iris_xe_destroy_exec_queue(batch);
batch->xe.exec_queue_id = new_exec_queue_id;
iris_lost_context_state(batch);
}