radv/winsys: pad gfx and compute IBs with only one NOP

1-dword NOPs are slow and it's better to emit a sized NOP packet when
possible.

Based on RadeonSI.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30743>
This commit is contained in:
Samuel Pitoiset
2024-08-20 15:39:56 +02:00
committed by Marge Bot
parent 29a2e5358d
commit d690f293c6
3 changed files with 44 additions and 10 deletions

View File

@@ -297,6 +297,8 @@ struct radeon_winsys {
void (*cs_annotate)(struct radeon_cmdbuf *cs, const char *marker);
void (*cs_pad)(struct radeon_cmdbuf *cs, unsigned leave_dw_space);
void (*dump_bo_ranges)(struct radeon_winsys *ws, FILE *file);
void (*dump_bo_log)(struct radeon_winsys *ws, FILE *file);

View File

@@ -421,6 +421,36 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
cs->base.max_dw = ib_size / 4 - 4;
}
/**
 * Pad a GFX or compute command stream so that the current dword count plus
 * leave_dw_space is a multiple of the IP's IB padding granularity
 * (ib_pad_dw_mask + 1).
 *
 * \param _cs             command stream to pad
 * \param leave_dw_space  number of dwords that are counted toward the
 *                        alignment but left unwritten after the padding
 *                        (e.g. room kept for packets emitted afterwards)
 */
static void
radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
{
   struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
   const enum amd_ip_type ip_type = cs->hw_ip;
   const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
   const uint32_t misaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;

   assert(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);

   if (misaligned_dw != 0) {
      /* Number of dwords still missing to reach the next aligned boundary. */
      const int pad_dw = pad_dw_mask + 1 - misaligned_dw;

      if (pad_dw != 1 || !cs->ws->info.gfx_ib_pad_with_type2) {
         /* Use one variable-sized NOP packet for the whole gap to keep the CP
          * overhead low. The body following the header is always count + 1
          * dwords, so a count of -1 (encoded as 0x3fff, i.e. PKT3_NOP_PAD)
          * means a header with no body; NOP is the only packet allowed to use
          * count == -1.
          */
         radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, pad_dw - 2, 0));

         /* Skip over the packet body; its dwords are not written here. */
         cs->base.cdw += pad_dw - 1;
      } else {
         /* Exactly one dword is missing and type-2 padding is requested for
          * this device: emit a single type-2 NOP.
          */
         radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
      }
   }

   assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
}
static VkResult
radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
{
@@ -429,15 +459,11 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
assert(cs->base.cdw <= cs->base.reserved_dw);
uint32_t ib_pad_dw_mask = MAX2(3, cs->ws->info.ip[ip_type].ib_pad_dw_mask);
uint32_t nop_packet = get_nop_packet(cs);
if (cs->use_ib) {
/* Ensure that with the 4 dword reservation we subtract from max_dw we always
* have 4 nops at the end for chaining.
*/
while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
radeon_emit_unchecked(&cs->base, nop_packet);
const uint32_t nop_packet = get_nop_packet(cs);
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
radv_amdgpu_winsys_cs_pad(_cs, 4);
radeon_emit_unchecked(&cs->base, nop_packet);
radeon_emit_unchecked(&cs->base, nop_packet);
@@ -458,8 +484,7 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
pad = false;
if (pad) {
while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask))
radeon_emit_unchecked(&cs->base, nop_packet);
radv_amdgpu_winsys_cs_pad(_cs, 0);
}
}
@@ -1909,4 +1934,5 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
}

View File

@@ -66,6 +66,11 @@ radv_null_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, UNUSED b
return &cs->base;
}
/* cs_pad callback installed by radv_null_cs_init_functions().
 *
 * Intentionally a no-op: both arguments are ignored and the command stream
 * is left untouched.
 */
static void
radv_null_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
{
   /* Nothing to pad. */
}
static VkResult
radv_null_cs_finalize(struct radeon_cmdbuf *_cs)
{
@@ -89,4 +94,5 @@ radv_null_cs_init_functions(struct radv_null_winsys *ws)
ws->base.cs_create = radv_null_cs_create;
ws->base.cs_finalize = radv_null_cs_finalize;
ws->base.cs_destroy = radv_null_cs_destroy;
ws->base.cs_pad = radv_null_cs_pad;
}