From d690f293c623b87a605b289cb8d8472b0628e5ea Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 20 Aug 2024 15:39:56 +0200 Subject: [PATCH] radv/winsys: pad gfx and compute IBs with only one NOP 1-dword NOPs are slow and it's better to emit a sized NOP packet when possible. Based on RadeonSI. Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/radv_radeon_winsys.h | 2 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 46 +++++++++++++++---- src/amd/vulkan/winsys/null/radv_null_cs.c | 6 +++ 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index d578a7ea6d8..336b0c7d216 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -297,6 +297,8 @@ struct radeon_winsys { void (*cs_annotate)(struct radeon_cmdbuf *cs, const char *marker); + void (*cs_pad)(struct radeon_cmdbuf *cs, unsigned leave_dw_space); + void (*dump_bo_ranges)(struct radeon_winsys *ws, FILE *file); void (*dump_bo_log)(struct radeon_winsys *ws, FILE *file); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index 94d21ad83b0..fe59de4c855 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -421,6 +421,36 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size) cs->base.max_dw = ib_size / 4 - 4; } +static void +radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + const enum amd_ip_type ip_type = cs->hw_ip; + const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask; + const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask; + + assert(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE); + + if (unaligned_dw) { + const int remaining = pad_dw_mask + 1 - unaligned_dw; + + /* Only pad by 1 dword with the type-2 NOP if necessary. */ + if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) { + radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD); + } else { + /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized + * packet. The size of the packet body after the header is always count + 1. + * If count == -1, there is no packet body. NOP is the only packet that can have + * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1). + */ + radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0)); + cs->base.cdw += remaining - 1; + } + } + + assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0); +} + static VkResult radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs) { @@ -429,15 +459,11 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs) assert(cs->base.cdw <= cs->base.reserved_dw); - uint32_t ib_pad_dw_mask = MAX2(3, cs->ws->info.ip[ip_type].ib_pad_dw_mask); - uint32_t nop_packet = get_nop_packet(cs); - if (cs->use_ib) { - /* Ensure that with the 4 dword reservation we subtract from max_dw we always - * have 4 nops at the end for chaining. - */ - while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3) - radeon_emit_unchecked(&cs->base, nop_packet); + const uint32_t nop_packet = get_nop_packet(cs); + + /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */ + radv_amdgpu_winsys_cs_pad(_cs, 4); radeon_emit_unchecked(&cs->base, nop_packet); radeon_emit_unchecked(&cs->base, nop_packet); @@ -458,8 +484,7 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs) pad = false; if (pad) { - while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask)) - radeon_emit_unchecked(&cs->base, nop_packet); + radv_amdgpu_winsys_cs_pad(_cs, 0); } } @@ -1909,4 +1934,5 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws) ws->base.cs_submit = radv_amdgpu_winsys_cs_submit; ws->base.cs_dump = radv_amdgpu_winsys_cs_dump; ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate; + ws->base.cs_pad = radv_amdgpu_winsys_cs_pad; } diff --git a/src/amd/vulkan/winsys/null/radv_null_cs.c b/src/amd/vulkan/winsys/null/radv_null_cs.c index ffe3a84cfd1..60fd97cba88 100644 --- a/src/amd/vulkan/winsys/null/radv_null_cs.c +++ b/src/amd/vulkan/winsys/null/radv_null_cs.c @@ -66,6 +66,11 @@ radv_null_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, UNUSED b return &cs->base; } +static void +radv_null_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space) +{ +} + static VkResult radv_null_cs_finalize(struct radeon_cmdbuf *_cs) { @@ -89,4 +94,5 @@ radv_null_cs_init_functions(struct radv_null_winsys *ws) ws->base.cs_create = radv_null_cs_create; ws->base.cs_finalize = radv_null_cs_finalize; ws->base.cs_destroy = radv_null_cs_destroy; + ws->base.cs_pad = radv_null_cs_pad; }