anv: Query the kernel for reset status
When a client causes a GPU hang (or experiences issues due to a hang in another client) we want to let it know as soon as possible. In particular, if it submits work with a fence and calls vkWaitForFences or vkQueueWaitIdle and it returns VK_SUCCESS, then the client should be able to trust the results of that rendering. In order to provide this guarantee, we have to ask the kernel for context status in a few key locations. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
@@ -888,8 +888,6 @@ anv_device_submit_simple_batch(struct anv_device *device,
|
||||
struct anv_bo bo, *exec_bos[1];
|
||||
VkResult result = VK_SUCCESS;
|
||||
uint32_t size;
|
||||
int64_t timeout;
|
||||
int ret;
|
||||
|
||||
/* Kernel driver requires 8 byte aligned batch length */
|
||||
size = align_u32(batch->next - batch->start, 8);
|
||||
@@ -929,14 +927,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
|
||||
if (result != VK_SUCCESS)
|
||||
goto fail;
|
||||
|
||||
timeout = INT64_MAX;
|
||||
ret = anv_gem_wait(device, bo.gem_handle, &timeout);
|
||||
if (ret != 0) {
|
||||
/* We don't know the real error. */
|
||||
device->lost = true;
|
||||
result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
|
||||
goto fail;
|
||||
}
|
||||
result = anv_device_wait(device, &bo, INT64_MAX);
|
||||
|
||||
fail:
|
||||
anv_bo_pool_free(&device->batch_bo_pool, &bo);
|
||||
@@ -1268,6 +1259,58 @@ anv_device_execbuf(struct anv_device *device,
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult
|
||||
anv_device_query_status(struct anv_device *device)
|
||||
{
|
||||
/* This isn't likely as most of the callers of this function already check
|
||||
* for it. However, it doesn't hurt to check and it potentially lets us
|
||||
* avoid an ioctl.
|
||||
*/
|
||||
if (unlikely(device->lost))
|
||||
return VK_ERROR_DEVICE_LOST;
|
||||
|
||||
uint32_t active, pending;
|
||||
int ret = anv_gem_gpu_get_reset_stats(device, &active, &pending);
|
||||
if (ret == -1) {
|
||||
/* We don't know the real error. */
|
||||
device->lost = true;
|
||||
return vk_errorf(VK_ERROR_DEVICE_LOST, "get_reset_stats failed: %m");
|
||||
}
|
||||
|
||||
if (active) {
|
||||
device->lost = true;
|
||||
return vk_errorf(VK_ERROR_DEVICE_LOST,
|
||||
"GPU hung on one of our command buffers");
|
||||
} else if (pending) {
|
||||
device->lost = true;
|
||||
return vk_errorf(VK_ERROR_DEVICE_LOST,
|
||||
"GPU hung with commands in-flight");
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult
|
||||
anv_device_wait(struct anv_device *device, struct anv_bo *bo,
|
||||
int64_t timeout)
|
||||
{
|
||||
int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
|
||||
if (ret == -1 && errno == ETIME) {
|
||||
return VK_TIMEOUT;
|
||||
} else if (ret == -1) {
|
||||
/* We don't know the real error. */
|
||||
device->lost = true;
|
||||
return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
|
||||
}
|
||||
|
||||
/* Query for device status after the wait. If the BO we're waiting on got
|
||||
* caught in a GPU hang we don't want to return VK_SUCCESS to the client
|
||||
* because it clearly doesn't have valid data. Yes, this most likely means
|
||||
* an ioctl, but we just did an ioctl to wait so it's no great loss.
|
||||
*/
|
||||
return anv_device_query_status(device);
|
||||
}
|
||||
|
||||
VkResult anv_QueueSubmit(
|
||||
VkQueue _queue,
|
||||
uint32_t submitCount,
|
||||
@@ -1277,10 +1320,17 @@ VkResult anv_QueueSubmit(
|
||||
ANV_FROM_HANDLE(anv_queue, queue, _queue);
|
||||
ANV_FROM_HANDLE(anv_fence, fence, _fence);
|
||||
struct anv_device *device = queue->device;
|
||||
if (unlikely(device->lost))
|
||||
return VK_ERROR_DEVICE_LOST;
|
||||
|
||||
VkResult result = VK_SUCCESS;
|
||||
/* Query for device status prior to submitting. Technically, we don't need
|
||||
* to do this. However, if we have a client that's submitting piles of
|
||||
* garbage, we would rather break as early as possible to keep the GPU
|
||||
* hanging contained. If we don't check here, we'll either be waiting for
|
||||
* the kernel to kick us or we'll have to wait until the client waits on a
|
||||
* fence before we actually know whether or not we've hung.
|
||||
*/
|
||||
VkResult result = anv_device_query_status(device);
|
||||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
|
||||
/* We lock around QueueSubmit for three main reasons:
|
||||
*
|
||||
@@ -1806,9 +1856,6 @@ VkResult anv_GetFenceStatus(
|
||||
if (unlikely(device->lost))
|
||||
return VK_ERROR_DEVICE_LOST;
|
||||
|
||||
int64_t t = 0;
|
||||
int ret;
|
||||
|
||||
switch (fence->state) {
|
||||
case ANV_FENCE_STATE_RESET:
|
||||
/* If it hasn't even been sent off to the GPU yet, it's not ready */
|
||||
@@ -1818,15 +1865,18 @@ VkResult anv_GetFenceStatus(
|
||||
/* It's been signaled, return success */
|
||||
return VK_SUCCESS;
|
||||
|
||||
case ANV_FENCE_STATE_SUBMITTED:
|
||||
/* It's been submitted to the GPU but we don't know if it's done yet. */
|
||||
ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
|
||||
if (ret == 0) {
|
||||
case ANV_FENCE_STATE_SUBMITTED: {
|
||||
VkResult result = anv_device_wait(device, &fence->bo, 0);
|
||||
switch (result) {
|
||||
case VK_SUCCESS:
|
||||
fence->state = ANV_FENCE_STATE_SIGNALED;
|
||||
return VK_SUCCESS;
|
||||
} else {
|
||||
case VK_TIMEOUT:
|
||||
return VK_NOT_READY;
|
||||
default:
|
||||
return result;
|
||||
}
|
||||
}
|
||||
default:
|
||||
unreachable("Invalid fence status");
|
||||
}
|
||||
@@ -1888,20 +1938,20 @@ VkResult anv_WaitForFences(
|
||||
/* These are the fences we really care about. Go ahead and wait
|
||||
* on it until we hit a timeout.
|
||||
*/
|
||||
ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout);
|
||||
if (ret == -1 && errno == ETIME) {
|
||||
result = VK_TIMEOUT;
|
||||
goto done;
|
||||
} else if (ret == -1) {
|
||||
/* We don't know the real error. */
|
||||
device->lost = true;
|
||||
return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
|
||||
} else {
|
||||
result = anv_device_wait(device, &fence->bo, timeout);
|
||||
switch (result) {
|
||||
case VK_SUCCESS:
|
||||
fence->state = ANV_FENCE_STATE_SIGNALED;
|
||||
signaled_fences = true;
|
||||
if (!waitAll)
|
||||
return VK_SUCCESS;
|
||||
continue;
|
||||
goto done;
|
||||
break;
|
||||
|
||||
case VK_TIMEOUT:
|
||||
goto done;
|
||||
|
||||
default:
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -301,6 +301,23 @@ anv_gem_get_aperture(int fd, uint64_t *size)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
anv_gem_gpu_get_reset_stats(struct anv_device *device,
|
||||
uint32_t *active, uint32_t *pending)
|
||||
{
|
||||
struct drm_i915_reset_stats stats = {
|
||||
.ctx_id = device->context_id,
|
||||
};
|
||||
|
||||
int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
|
||||
if (ret == 0) {
|
||||
*active = stats.batch_active;
|
||||
*pending = stats.batch_pending;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
|
||||
{
|
||||
|
@@ -637,6 +637,9 @@ void anv_device_finish_blorp(struct anv_device *device);
|
||||
VkResult anv_device_execbuf(struct anv_device *device,
|
||||
struct drm_i915_gem_execbuffer2 *execbuf,
|
||||
struct anv_bo **execbuf_bos);
|
||||
VkResult anv_device_query_status(struct anv_device *device);
|
||||
VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
|
||||
int64_t timeout);
|
||||
|
||||
void* anv_gem_mmap(struct anv_device *device,
|
||||
uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
|
||||
@@ -654,6 +657,8 @@ int anv_gem_destroy_context(struct anv_device *device, int context);
|
||||
int anv_gem_get_param(int fd, uint32_t param);
|
||||
bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
|
||||
int anv_gem_get_aperture(int fd, uint64_t *size);
|
||||
int anv_gem_gpu_get_reset_stats(struct anv_device *device,
|
||||
uint32_t *active, uint32_t *pending);
|
||||
int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
|
||||
uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
|
||||
int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
|
||||
|
@@ -143,8 +143,6 @@ VkResult genX(GetQueryPoolResults)(
|
||||
{
|
||||
ANV_FROM_HANDLE(anv_device, device, _device);
|
||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||
int64_t timeout = INT64_MAX;
|
||||
int ret;
|
||||
|
||||
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
||||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
|
||||
@@ -157,12 +155,9 @@ VkResult genX(GetQueryPoolResults)(
|
||||
return VK_SUCCESS;
|
||||
|
||||
if (flags & VK_QUERY_RESULT_WAIT_BIT) {
|
||||
ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
|
||||
if (ret == -1) {
|
||||
/* We don't know the real error. */
|
||||
return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
|
||||
"gem_wait failed %m");
|
||||
}
|
||||
VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
|
||||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
}
|
||||
|
||||
void *data_end = pData + dataSize;
|
||||
|
Reference in New Issue
Block a user