radv: implement IB chaining for DGC when it's executed on compute

The IB2 packet is only supported on the graphics queue. To execute a
DGC IB on compute, the previous solution was to submit it separately,
without any chaining. This solution was incomplete, though, because
it's easy to reach the maximum number of IBs per submit when there are
many ExecuteIndirect() calls.

To fix that, the proposed solution is to implement DGC IB chaining
when it's executed on the compute queue only. The idea is to add a
trailer at the beginning of the DGC IB (so that its offset is fixed
and known). This trailer is used to chain the DGC IB back to a normal
CS and is patched at execution time. Patching is fine because
executing the same DGC IB concurrently isn't allowed, and the entire
solution relies on that.

When the DGC IB is executed on graphics, the trailer isn't patched and
only contains NOP padding. Performance should be largely unaffected.
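
As a minimal sketch (with hypothetical names; the real packets are
built by the winsys and the DGC prepare shader below), the last 16
bytes of the trailer are NOP dwords until execute time, then get
overwritten with an INDIRECT_BUFFER packet that chains back to a
normal CS:

    /* Unpatched trailer tail: four NOPs reserving the chain slot. */
    const uint32_t trailer_tail[4] = {
       PKT3_NOP_PAD, PKT3_NOP_PAD, PKT3_NOP_PAD, PKT3_NOP_PAD,
    };

    /* What gets written there at execute time; next_cs_va and
     * next_cs_cdw are illustrative names for the follow-up CS. */
    const uint32_t chain_packet[4] = {
       PKT3(PKT3_INDIRECT_BUFFER, 2, 0),
       (uint32_t)next_cs_va,         /* VA low dword */
       (uint32_t)(next_cs_va >> 32), /* VA high dword */
       S_3F2_CHAIN(1) | S_3F2_VALID(1) | next_cs_cdw,
    };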

This fixes
dEQP-VK.dgc.nv.compute.misc.execute_many_*_primary_cmd_compute_queue.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30809>
commit c1b2cb6ef7 (parent 303a456aa5)
Author: Samuel Pitoiset
Date: 2024-08-20 15:00:29 +02:00
Committed-by: Marge Bot
6 changed files with 286 additions and 31 deletions


@@ -11601,16 +11601,20 @@ radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommand
const uint32_t cmdbuf_size = radv_get_indirect_gfx_cmdbuf_size(pGeneratedCommandsInfo);
const uint64_t ib_va =
radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
const uint64_t gfx_trailer_va = ib_va + radv_get_indirect_gfx_trailer_offset(pGeneratedCommandsInfo);
const uint64_t gfx_ib_va = ib_va + radv_get_indirect_gfx_cmdbuf_offset(pGeneratedCommandsInfo);
device->ws->cs_execute_ib(cmd_buffer->cs, NULL, ib_va, cmdbuf_size >> 2, cmd_buffer->state.predicating);
device->ws->cs_chain_dgc_ib(cmd_buffer->cs, gfx_ib_va, cmdbuf_size >> 2, gfx_trailer_va,
cmd_buffer->state.predicating);
if (has_task_shader) {
const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo);
const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo);
const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo);
assert(cmd_buffer->gang.cs);
device->ws->cs_execute_ib(cmd_buffer->gang.cs, NULL, ace_ib_va, ace_cmdbuf_size >> 2,
cmd_buffer->state.predicating);
device->ws->cs_chain_dgc_ib(cmd_buffer->gang.cs, ace_ib_va, ace_cmdbuf_size >> 2, ace_trailer_va,
cmd_buffer->state.predicating);
}
}
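
For clarity, a rough sketch of the resulting control flow on a compute
queue, assembled from the changes in this commit (illustrative, not a
literal packet stream):

    /*
     * main CS:
     *   WRITE_DATA            -> patch the trailer's chain slot
     *   (GFX8: ACQUIRE_MEM    -> write back L2; CP isn't L2-coherent)
     *   INDIRECT_BUFFER+CHAIN -> jump into the DGC IB
     * DGC IB:
     *   generated commands, NOP padding,
     *   INDIRECT_BUFFER+CHAIN -> jump to the trailer
     * trailer:
     *   NOP padding,
     *   patched chain slot    -> jump to the newly allocated follow-up CS
     */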


@@ -23,16 +23,26 @@
*
* Without the DGC preamble, the default layout looks like:
*
* +----------+---------+
* | commands | padding |
* +----------+---------+
* +---------+----------+---------+-----------------+
* | trailer | commands | padding | jump to trailer |
* +---------+----------+---------+-----------------+
*
* With the DGC preamble, which is used to optimize large empty indirect sequence count by removing
* a ton of padding, the layout looks like:
* The trailer is used to implement IB chaining for the compute queue because
* IB2 isn't supported. The trailer is patched at execute time on the CPU to
* chain back the DGC command buffer. The trailer is added at the beginning so
* that its offset is fixed (i.e. it's not possible to know the offset when a
* preamble is used). In practice the execution looks like:
*
* +---------+-----------------+ +----------+---------+
* | padding | INDIRECT_BUFFER | -> | commands | padding |
* +---------+-----------------+ +----------+---------+
* +----------+---------+-----------------+ +---------+ +-----------------------+
* | commands | padding | jump to trailer | -> | trailer | -> | postamble (normal CS) |
* +----------+---------+-----------------+ +---------+ +-----------------------+
*
* When DGC uses a preamble (to optimize large empty indirect sequence count by removing a ton of
* padding), the trailer is still used but the layout looks like:
*
* +---------+---------+-----------------+ +----------+---------+-----------------+
* | trailer | padding | INDIRECT_BUFFER | -> | commands | padding | jump to trailer |
* +---------+---------+-----------------+ +----------+---------+-----------------+
*
* When DGC uses task shaders, the command buffer is split into two parts (GFX/COMPUTE); the
* default layout looks like:
@@ -43,8 +53,8 @@
*
* The execution of this DGC command buffer is different if it's GFX or COMPUTE queue:
* - on GFX, the driver uses the IB2 packet which is the easiest solution
* - on COMPUTE, IB2 isn't supported and the driver submits the DGC command buffer separately
* without chaining
* - on COMPUTE, IB2 isn't supported and the driver chains the DGC command
* buffer by patching the trailer
*/
static void
radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout,
@@ -259,6 +269,12 @@ radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type
return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
}
static unsigned
radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
{
return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
}
static bool
radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info)
{
@@ -271,11 +287,14 @@ struct dgc_cmdbuf_layout {
bool use_preamble;
uint32_t alloc_size;
uint32_t main_trailer_offset;
uint32_t main_preamble_offset;
uint32_t main_offset;
uint32_t main_cmd_stride;
uint32_t main_preamble_size;
uint32_t main_size;
uint32_t ace_trailer_offset;
uint32_t ace_preamble_offset;
uint32_t ace_main_offset;
uint32_t ace_cmd_stride;
@@ -305,11 +324,19 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
layout->ace_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
}
layout->main_size = radv_pad_cmdbuf(device, layout->main_cmd_stride * sequences_count, AMD_IP_GFX);
layout->ace_size = radv_pad_cmdbuf(device, layout->ace_cmd_stride * sequences_count, AMD_IP_COMPUTE);
layout->main_size =
radv_pad_cmdbuf(device, (layout->main_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX);
layout->ace_size =
radv_pad_cmdbuf(device, (layout->ace_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE);
layout->upload_size = layout->upload_stride * sequences_count;
/* Main */
layout->main_trailer_offset = 0;
offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
layout->main_preamble_offset = offset;
if (layout->use_preamble)
offset += layout->main_preamble_size;
offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
@@ -321,6 +348,10 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
if (layout->ace_cmd_stride) {
offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
layout->ace_trailer_offset = offset;
offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
layout->ace_preamble_offset = offset;
if (layout->use_preamble)
@@ -356,14 +387,8 @@ radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum am
return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_size : cmdbuf_layout.ace_size;
}
uint32_t
radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX);
}
uint32_t
radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
static uint32_t
radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
@@ -374,7 +399,48 @@ radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
return cmdbuf_layout.ace_preamble_offset;
return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset;
}
static uint32_t
radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
const bool use_preamble = radv_dgc_use_preamble(cmd_info);
const uint32_t sequences_count = cmd_info->sequencesCount;
struct dgc_cmdbuf_layout cmdbuf_layout;
get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
const uint32_t offset = ip_type == AMD_IP_GFX ? cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset;
return offset + radv_dgc_trailer_cmdbuf_size(device, ip_type) - PKT3_INDIRECT_BUFFER_BYTES;
}
uint32_t
radv_get_indirect_gfx_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_GFX);
}
uint32_t
radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX);
}
uint32_t
radv_get_indirect_gfx_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_GFX);
}
uint32_t
radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_COMPUTE);
}
uint32_t
@@ -383,10 +449,18 @@ radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_COMPUTE);
}
uint32_t
radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
{
return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_COMPUTE);
}
struct radv_dgc_params {
uint32_t cmd_buf_preamble_offset;
uint32_t cmd_buf_main_offset;
uint32_t cmd_buf_stride;
uint32_t cmd_buf_size;
uint32_t ace_cmd_buf_trailer_offset;
uint32_t ace_cmd_buf_preamble_offset;
uint32_t ace_cmd_buf_main_offset;
uint32_t ace_cmd_buf_stride;
@@ -900,7 +974,7 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str
const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
nir_def *size = nir_imul(b, cmd_buf_stride, sequence_count);
nir_def *size = nir_iadd_imm(b, nir_imul(b, cmd_buf_stride, sequence_count), PKT3_INDIRECT_BUFFER_BYTES);
unsigned align_mask = radv_pad_cmdbuf(device, 1, ip_type) - 1;
size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
@@ -913,7 +987,8 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str
static void
build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride,
nir_def *sequence_count, const struct radv_device *device)
nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size,
const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
@@ -926,6 +1001,9 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
/* Add NOPs padding but leave space for the INDIRECT_BUFFER packet. */
cmd_buf_size = nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES);
nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
nir_push_loop(b);
{
@@ -949,6 +1027,17 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
}
nir_pop_loop(b, NULL);
nir_def *chain_packet[] = {
nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
nir_imm_int(b, pdev->info.address32_hi),
nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
};
nir_build_store_global(b, nir_vec(b, chain_packet, 4),
nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
.access = ACCESS_NON_READABLE);
}
nir_pop_if(b, NULL);
}
@@ -959,8 +1048,11 @@ build_dgc_buffer_tail_gfx(nir_builder *b, nir_def *sequence_count, const struct
nir_def *cmd_buf_offset = load_param32(b, cmd_buf_main_offset);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
nir_def *cmd_buf_trailer_offset = nir_imm_int(b, 0);
unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4;
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, sequence_count, device);
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
trailer_size, device);
}
static void
@@ -969,8 +1061,63 @@ build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct
nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_main_offset);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
nir_def *cmd_buf_stride = load_param32(b, ace_cmd_buf_stride);
nir_def *cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4;
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, sequence_count, device);
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
trailer_size, device);
}
static void
build_dgc_buffer_trailer(nir_builder *b, nir_def *cmd_buf_offset, unsigned trailer_size,
const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *global_id = get_global_ids(b, 1);
nir_push_if(b, nir_ieq_imm(b, global_id, 0));
{
nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_offset));
const uint32_t pad_size = trailer_size - PKT3_INDIRECT_BUFFER_BYTES;
const uint32_t pad_size_dw = pad_size >> 2;
nir_def *len = nir_imm_int(b, pad_size_dw - 2);
nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
nir_def *nop_packets[] = {
nir_imm_int(b, PKT3_NOP_PAD),
nir_imm_int(b, PKT3_NOP_PAD),
nir_imm_int(b, PKT3_NOP_PAD),
nir_imm_int(b, PKT3_NOP_PAD),
};
nir_build_store_global(b, nir_vec(b, nop_packets, 4), nir_iadd_imm(b, va, pad_size),
.access = ACCESS_NON_READABLE);
}
nir_pop_if(b, NULL);
}
static void
build_dgc_buffer_trailer_gfx(nir_builder *b, const struct radv_device *device)
{
nir_def *cmd_buf_offset = nir_imm_int(b, 0);
const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
}
static void
build_dgc_buffer_trailer_ace(nir_builder *b, const struct radv_device *device)
{
nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_trailer_offset);
const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
}
static void
@@ -1014,7 +1161,7 @@ build_dgc_buffer_preamble(nir_builder *b, nir_def *cmd_buf_preamble_offset, nir_
static void
build_dgc_buffer_preamble_gfx(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
{
nir_def *cmd_buf_preamble_offset = nir_imm_int(b, 0);
nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
@@ -1948,6 +2095,8 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
sequence_count = nir_load_var(&b, count_var);
build_dgc_buffer_trailer_gfx(&b, dev);
nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
{
struct dgc_cmdbuf cmd_buf = {
@@ -2029,6 +2178,8 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_def *ace_cmd_buf_stride = load_param32(&b, ace_cmd_buf_stride);
nir_def *ace_cmd_buf_base_offset = load_param32(&b, ace_cmd_buf_main_offset);
build_dgc_buffer_trailer_ace(&b, dev);
nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
{
struct dgc_cmdbuf cmd_buf = {
@@ -2511,9 +2662,11 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
assert((cmdbuf_layout.ace_main_offset + upload_addr) % pdev->info.ip[AMD_IP_COMPUTE].ib_alignment == 0);
struct radv_dgc_params params = {
.cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset,
.cmd_buf_main_offset = cmdbuf_layout.main_offset,
.cmd_buf_stride = cmdbuf_layout.main_cmd_stride,
.cmd_buf_size = cmdbuf_layout.main_size,
.ace_cmd_buf_trailer_offset = cmdbuf_layout.ace_trailer_offset,
.ace_cmd_buf_preamble_offset = cmdbuf_layout.ace_preamble_offset,
.ace_cmd_buf_main_offset = cmdbuf_layout.ace_main_offset,
.ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride,

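Note that the trailer VA handed to cs_chain_dgc_ib points at the last
INDIRECT_BUFFER-sized slot of the trailer, i.e. the four dwords that
actually get patched. A sketch of the arithmetic, assuming (purely for
illustration) that radv_pad_cmdbuf() pads the trailer to 256 bytes:

    /* PKT3_INDIRECT_BUFFER_BYTES is the 4-dword (16-byte) packet size. */
    const uint32_t trailer_size = 256; /* assumed padded size, for example */
    const uint64_t patch_va =
       ib_va + trailer_offset + trailer_size - PKT3_INDIRECT_BUFFER_BYTES;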

@@ -61,8 +61,14 @@ uint32_t radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_
uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
uint32_t radv_get_indirect_gfx_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
uint32_t radv_get_indirect_gfx_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);


@@ -747,8 +747,10 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
.INTEL_shader_integer_functions2 = true,
.MESA_image_alignment_control = pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level <= GFX11_5,
.NV_compute_shader_derivatives = true,
.NV_device_generated_commands = pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc,
.NV_device_generated_commands_compute = pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc,
.NV_device_generated_commands =
pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
.NV_device_generated_commands_compute =
pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
/* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from
* using it.
*/


@@ -292,6 +292,9 @@ struct radeon_winsys {
void (*cs_execute_ib)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo, const uint64_t va, const uint32_t cdw,
const bool predicate);
void (*cs_chain_dgc_ib)(struct radeon_cmdbuf *cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
const bool predicate);
void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count,
enum radv_cs_dump_type type);
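
A hedged usage sketch of the new winsys hook, mirroring the call site
in radv_dgc_execute_ib above (all values computed as in that function):

    /* Chain the current CS into the DGC IB; on GFX this falls back to IB2. */
    ws->cs_chain_dgc_ib(cs, dgc_ib_va, cmdbuf_size >> 2, trailer_va, predicating);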


@@ -831,6 +831,92 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
}
}
static void
radv_amdgpu_cs_chain_dgc_ib(struct radeon_cmdbuf *_cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
const bool predicate)
{
struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
if (cs->status != VK_SUCCESS)
return;
assert(cs->ws->info.gfx_level >= GFX8);
if (cs->hw_ip == AMD_IP_GFX) {
/* Use IB2 for executing DGC CS on GFX. */
cs->ws->base.cs_execute_ib(_cs, NULL, va, cdw, predicate);
} else {
assert(va && va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
assert(cdw <= ~C_3F2_IB_SIZE);
/* Emit a WRITE_DATA packet to patch the DGC CS. */
const uint32_t chain_data[] = {
PKT3(PKT3_INDIRECT_BUFFER, 2, 0),
0,
0,
S_3F2_CHAIN(1) | S_3F2_VALID(1),
};
radeon_emit(&cs->base, PKT3(PKT3_WRITE_DATA, 2 + ARRAY_SIZE(chain_data), false));
radeon_emit(&cs->base, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
radeon_emit(&cs->base, trailer_va);
radeon_emit(&cs->base, trailer_va >> 32);
radeon_emit_array(&cs->base, chain_data, ARRAY_SIZE(chain_data));
/* Keep pointers for patching later. */
uint64_t *ib_va_ptr = (uint64_t *)(cs->base.buf + cs->base.cdw - 3);
uint32_t *ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
/* Writeback L2 because CP isn't coherent with L2 on GFX6-8. */
if (cs->ws->info.gfx_level == GFX8) {
radeon_emit(&cs->base, PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(1));
radeon_emit(&cs->base, S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
radeon_emit(&cs->base, 0xffffffff);
radeon_emit(&cs->base, 0xff);
radeon_emit(&cs->base, 0);
radeon_emit(&cs->base, 0);
radeon_emit(&cs->base, 0x0000000A);
}
/* Finalize the current CS. */
cs->ws->base.cs_finalize(_cs);
/* Chain the current CS to the DGC CS. */
_cs->buf[_cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
_cs->buf[_cs->cdw - 3] = va;
_cs->buf[_cs->cdw - 2] = va >> 32;
_cs->buf[_cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | cdw;
/* Allocate a new CS BO with initial size. */
const uint64_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
if (result != VK_SUCCESS) {
cs->base.cdw = 0;
cs->status = result;
return;
}
cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
if (!cs->ib_mapped) {
cs->base.cdw = 0;
cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
return;
}
cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
/* Chain back the trailer (DGC CS) to the newly created one. */
*ib_va_ptr = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
cs->ib_size_ptr = ib_size_ptr;
cs->base.buf = (uint32_t *)cs->ib_mapped;
cs->base.cdw = 0;
cs->base.reserved_dw = 0;
cs->base.max_dw = ib_size / 4 - 4;
}
}
static unsigned
radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
{
@@ -1934,6 +2020,7 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
ws->base.cs_chain_dgc_ib = radv_amdgpu_cs_chain_dgc_ib;
ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;