radeonsi: fix a perf regression due to slow reply from GEM_WAIT_IDLE for timeout=0

It sometimes takes 1 ms to return with timeout=0, which is unacceptable.

Fixes: 4194774edf - radeonsi: move barriers out of si_launch_grid_internal_ssbos

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32877>
This commit is contained in:
Marek Olšák
2025-01-03 11:29:39 -05:00
committed by Marge Bot
parent dd11eec06b
commit 271d5edf87
3 changed files with 20 additions and 2 deletions

View File

@@ -497,7 +497,7 @@ static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
unsigned usage)
{
return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage);
sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage | RADEON_USAGE_DISALLOW_SLOW_REPLY);
}
void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,

View File

@@ -188,7 +188,12 @@ enum radeon_ctx_pstate
#define RADEON_PRIO_SHADER_RINGS (1 << 22)
#define RADEON_PRIO_SCRATCH_BUFFER (1 << 23)
#define RADEON_ALL_PRIORITIES (RADEON_USAGE_READ - 1)
#define RADEON_ALL_PRIORITIES BITFIELD_MASK(24)
/* When passed to radeon_winsys::buffer_wait, it disallows using the DRM ioctl for timeout=0
* queries because it can take ~1 ms to return, reducing FPS.
*/
#define RADEON_USAGE_DISALLOW_SLOW_REPLY (1 << 26)
/* Upper bits of priorities are used by usage flags. */
#define RADEON_USAGE_READ (1 << 27)
@@ -375,6 +380,13 @@ struct radeon_winsys {
* The timeout of 0 will only return the status.
* The timeout of OS_TIMEOUT_INFINITE will always wait until the buffer
* is idle.
*
* usage is RADEON_USAGE_READ/WRITE.
*
* Checking whether a buffer is idle using timeout=0 can take 1 ms if the DRM ioctl is
* used, reducing our FPS to several hundreds. To prevent that, set
* RADEON_USAGE_DISALLOW_SLOW_REPLY, which makes such a query return busy instead of calling
* the ioctl. This is a workaround for kernel inefficiency.
*/
bool (*buffer_wait)(struct radeon_winsys *ws, struct pb_buffer_lean *buf,
uint64_t timeout, unsigned usage);

View File

@@ -103,6 +103,12 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
bool buffer_busy = true;
int r;
/* The GEM_WAIT_IDLE ioctl with timeout=0 can take up to 1 ms to return. This is a kernel
* inefficiency. RADEON_USAGE_DISALLOW_SLOW_REPLY indicates that it's better to return busy
* than to wait for 1 ms.
*/
if (timeout == 0 && usage & RADEON_USAGE_DISALLOW_SLOW_REPLY)
return false;
r = ac_drm_bo_wait_for_idle(aws->fd, get_real_bo(bo)->kms_handle, timeout, &buffer_busy);
if (r)
fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, r);