From 271d5edf876f6d284301276de611782371d1710d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 3 Jan 2025 11:29:39 -0500 Subject: [PATCH] radeonsi: fix a perf regression due to slow reply from GEM_WAIT_IDLE for timeout=0 It sometimes takes 1 ms to return with timeout=0, which is unacceptable. Fixes: 4194774edf5 - radeonsi: move barriers out of si_launch_grid_internal_ssbos Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_barrier.c | 2 +- src/gallium/include/winsys/radeon_winsys.h | 14 +++++++++++++- src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_barrier.c b/src/gallium/drivers/radeonsi/si_barrier.c index 7186de80ded..4951fd84b83 100644 --- a/src/gallium/drivers/radeonsi/si_barrier.c +++ b/src/gallium/drivers/radeonsi/si_barrier.c @@ -497,7 +497,7 @@ static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf, unsigned usage) { return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) && - sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage); + sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage | RADEON_USAGE_DISALLOW_SLOW_REPLY); } void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags, diff --git a/src/gallium/include/winsys/radeon_winsys.h b/src/gallium/include/winsys/radeon_winsys.h index cf9345cf966..fe81edecdef 100644 --- a/src/gallium/include/winsys/radeon_winsys.h +++ b/src/gallium/include/winsys/radeon_winsys.h @@ -188,7 +188,12 @@ enum radeon_ctx_pstate #define RADEON_PRIO_SHADER_RINGS (1 << 22) #define RADEON_PRIO_SCRATCH_BUFFER (1 << 23) -#define RADEON_ALL_PRIORITIES (RADEON_USAGE_READ - 1) +#define RADEON_ALL_PRIORITIES BITFIELD_MASK(24) + +/* When passed to radeon_winsys::buffer_wait, it disallows using the DRM ioctl for timeout=0 + * queries because it can take ~1 ms to return, reducing FPS. 
+ */ +#define RADEON_USAGE_DISALLOW_SLOW_REPLY (1 << 26) /* Upper bits of priorities are used by usage flags. */ #define RADEON_USAGE_READ (1 << 27) @@ -375,6 +380,13 @@ struct radeon_winsys { * The timeout of 0 will only return the status. * The timeout of OS_TIMEOUT_INFINITE will always wait until the buffer * is idle. + * + * usage is RADEON_USAGE_READ/WRITE. + * + * Checking whether a buffer is idle using timeout=0 can take 1 ms when the DRM ioctl is + * used, reducing our FPS to several hundred. To prevent that, set + * RADEON_USAGE_DISALLOW_SLOW_REPLY, which will return busy. This is a workaround for kernel + * inefficiency. */ bool (*buffer_wait)(struct radeon_winsys *ws, struct pb_buffer_lean *buf, uint64_t timeout, unsigned usage); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 5a24ea85bc4..373f3c330f2 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -103,6 +103,12 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws, bool buffer_busy = true; int r; + /* The GEM_WAIT_IDLE ioctl with timeout=0 can take up to 1 ms to return. This is a kernel + * inefficiency. This flag indicates whether it's better to return busy than wait for 1 ms. + */ + if (timeout == 0 && usage & RADEON_USAGE_DISALLOW_SLOW_REPLY) + return false; + r = ac_drm_bo_wait_for_idle(aws->fd, get_real_bo(bo)->kms_handle, timeout, &buffer_busy); if (r) fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, r);