radv/winsys: pad gfx and compute IBs with only one NOP
1-dword NOPs are slow, so it is better to emit a single sized NOP packet when possible. Based on RadeonSI.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30743>
This commit is contained in:

committed by
Marge Bot

parent
29a2e5358d
commit
d690f293c6
@@ -297,6 +297,8 @@ struct radeon_winsys {
|
|||||||
|
|
||||||
void (*cs_annotate)(struct radeon_cmdbuf *cs, const char *marker);
|
void (*cs_annotate)(struct radeon_cmdbuf *cs, const char *marker);
|
||||||
|
|
||||||
|
void (*cs_pad)(struct radeon_cmdbuf *cs, unsigned leave_dw_space);
|
||||||
|
|
||||||
void (*dump_bo_ranges)(struct radeon_winsys *ws, FILE *file);
|
void (*dump_bo_ranges)(struct radeon_winsys *ws, FILE *file);
|
||||||
|
|
||||||
void (*dump_bo_log)(struct radeon_winsys *ws, FILE *file);
|
void (*dump_bo_log)(struct radeon_winsys *ws, FILE *file);
|
||||||
|
@@ -421,6 +421,36 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
|
|||||||
cs->base.max_dw = ib_size / 4 - 4;
|
cs->base.max_dw = ib_size / 4 - 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
|
||||||
|
{
|
||||||
|
struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
|
||||||
|
const enum amd_ip_type ip_type = cs->hw_ip;
|
||||||
|
const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
|
||||||
|
const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;
|
||||||
|
|
||||||
|
assert(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
|
||||||
|
|
||||||
|
if (unaligned_dw) {
|
||||||
|
const int remaining = pad_dw_mask + 1 - unaligned_dw;
|
||||||
|
|
||||||
|
/* Only pad by 1 dword with the type-2 NOP if necessary. */
|
||||||
|
if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) {
|
||||||
|
radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
|
||||||
|
} else {
|
||||||
|
/* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
|
||||||
|
* packet. The size of the packet body after the header is always count + 1.
|
||||||
|
* If count == -1, there is no packet body. NOP is the only packet that can have
|
||||||
|
* count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
|
||||||
|
*/
|
||||||
|
radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0));
|
||||||
|
cs->base.cdw += remaining - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
static VkResult
|
static VkResult
|
||||||
radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
|
radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
|
||||||
{
|
{
|
||||||
@@ -429,15 +459,11 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
|
|||||||
|
|
||||||
assert(cs->base.cdw <= cs->base.reserved_dw);
|
assert(cs->base.cdw <= cs->base.reserved_dw);
|
||||||
|
|
||||||
uint32_t ib_pad_dw_mask = MAX2(3, cs->ws->info.ip[ip_type].ib_pad_dw_mask);
|
|
||||||
uint32_t nop_packet = get_nop_packet(cs);
|
|
||||||
|
|
||||||
if (cs->use_ib) {
|
if (cs->use_ib) {
|
||||||
/* Ensure that with the 4 dword reservation we subtract from max_dw we always
|
const uint32_t nop_packet = get_nop_packet(cs);
|
||||||
* have 4 nops at the end for chaining.
|
|
||||||
*/
|
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
|
||||||
while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
|
radv_amdgpu_winsys_cs_pad(_cs, 4);
|
||||||
radeon_emit_unchecked(&cs->base, nop_packet);
|
|
||||||
|
|
||||||
radeon_emit_unchecked(&cs->base, nop_packet);
|
radeon_emit_unchecked(&cs->base, nop_packet);
|
||||||
radeon_emit_unchecked(&cs->base, nop_packet);
|
radeon_emit_unchecked(&cs->base, nop_packet);
|
||||||
@@ -458,8 +484,7 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
|
|||||||
pad = false;
|
pad = false;
|
||||||
|
|
||||||
if (pad) {
|
if (pad) {
|
||||||
while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask))
|
radv_amdgpu_winsys_cs_pad(_cs, 0);
|
||||||
radeon_emit_unchecked(&cs->base, nop_packet);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1909,4 +1934,5 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
|
|||||||
ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
|
ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
|
||||||
ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
|
ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
|
||||||
ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
|
ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
|
||||||
|
ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
|
||||||
}
|
}
|
||||||
|
@@ -66,6 +66,11 @@ radv_null_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, UNUSED b
|
|||||||
return &cs->base;
|
return &cs->base;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * cs_pad implementation for the null winsys: intentionally a no-op, both
 * arguments are ignored.
 */
static void
radv_null_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
{
   (void)_cs;
   (void)leave_dw_space;
}
|
||||||
|
|
||||||
static VkResult
|
static VkResult
|
||||||
radv_null_cs_finalize(struct radeon_cmdbuf *_cs)
|
radv_null_cs_finalize(struct radeon_cmdbuf *_cs)
|
||||||
{
|
{
|
||||||
@@ -89,4 +94,5 @@ radv_null_cs_init_functions(struct radv_null_winsys *ws)
|
|||||||
ws->base.cs_create = radv_null_cs_create;
|
ws->base.cs_create = radv_null_cs_create;
|
||||||
ws->base.cs_finalize = radv_null_cs_finalize;
|
ws->base.cs_finalize = radv_null_cs_finalize;
|
||||||
ws->base.cs_destroy = radv_null_cs_destroy;
|
ws->base.cs_destroy = radv_null_cs_destroy;
|
||||||
|
ws->base.cs_pad = radv_null_cs_pad;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user