diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 84731c5590b..438c497c81e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -11601,16 +11601,20 @@ radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommand
    const uint32_t cmdbuf_size = radv_get_indirect_gfx_cmdbuf_size(pGeneratedCommandsInfo);
    const uint64_t ib_va =
       radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
+   const uint64_t gfx_trailer_va = ib_va + radv_get_indirect_gfx_trailer_offset(pGeneratedCommandsInfo);
+   const uint64_t gfx_ib_va = ib_va + radv_get_indirect_gfx_cmdbuf_offset(pGeneratedCommandsInfo);
 
-   device->ws->cs_execute_ib(cmd_buffer->cs, NULL, ib_va, cmdbuf_size >> 2, cmd_buffer->state.predicating);
+   device->ws->cs_chain_dgc_ib(cmd_buffer->cs, gfx_ib_va, cmdbuf_size >> 2, gfx_trailer_va,
+                               cmd_buffer->state.predicating);
 
    if (has_task_shader) {
       const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo);
+      const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo);
       const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo);
 
       assert(cmd_buffer->gang.cs);
 
-      device->ws->cs_execute_ib(cmd_buffer->gang.cs, NULL, ace_ib_va, ace_cmdbuf_size >> 2,
-                                cmd_buffer->state.predicating);
+      device->ws->cs_chain_dgc_ib(cmd_buffer->gang.cs, ace_ib_va, ace_cmdbuf_size >> 2, ace_trailer_va,
+                                  cmd_buffer->state.predicating);
    }
 }
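Everything below hinges on the PKT3 INDIRECT_BUFFER packet, so a standalone model of its 4-dword encoding may help when reading the later hunks. The pkt3_header() helper and the CHAIN/VALID bit positions are illustrative assumptions, not Mesa's real PKT3()/S_3F2_* macros:

#include <stdint.h>

/* Hypothetical type-3 PM4 header: packet type in bits [31:30], payload count
 * (dwords minus one) in [29:16], IT opcode in [15:8]. Illustrative only. */
static uint32_t pkt3_header(uint32_t opcode, uint32_t count)
{
   return (3u << 30) | ((count & 0x3fffu) << 16) | ((opcode & 0xffu) << 8);
}

/* Model of the 4-dword INDIRECT_BUFFER packet used for IB chaining:
 * header, VA low, VA high, then size-in-dwords OR'd with CHAIN and VALID
 * control bits (bit positions assumed here). */
static void build_chain_packet(uint32_t out[4], uint64_t ib_va, uint32_t cdw)
{
   const uint32_t PKT3_INDIRECT_BUFFER_OP = 0x3f;

   out[0] = pkt3_header(PKT3_INDIRECT_BUFFER_OP, 2);
   out[1] = (uint32_t)ib_va;         /* VA low 32 bits */
   out[2] = (uint32_t)(ib_va >> 32); /* VA high bits */
   out[3] = cdw | (1u << 20) /* CHAIN (assumed) */ | (1u << 23) /* VALID (assumed) */;
}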
diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
index 2a28c6f8b08..c4f37c47343 100644
--- a/src/amd/vulkan/radv_device_generated_commands.c
+++ b/src/amd/vulkan/radv_device_generated_commands.c
@@ -23,16 +23,26 @@
  *
  * Without the DGC preamble, the default layout looks like:
  *
- * +----------+---------+
- * | commands | padding |
- * +----------+---------+
+ * +---------+----------+---------+-----------------+
+ * | trailer | commands | padding | jump to trailer |
+ * +---------+----------+---------+-----------------+
  *
- * With the DGC preamble, which is used to optimize large empty indirect sequence count by removing
- * a ton of padding, the layout looks like:
+ * The trailer implements IB chaining on the compute queue, where IB2 isn't
+ * supported: it is patched at execute time on the CPU to chain execution
+ * back from the DGC command buffer. The trailer sits at the beginning so
+ * that its offset is always fixed (with a preamble in front, the offset
+ * couldn't be known in advance). In practice the execution looks like:
  *
- * +---------+-----------------+    +----------+---------+
- * | padding | INDIRECT_BUFFER | -> | commands | padding |
- * +---------+-----------------+    +----------+---------+
+ * +----------+---------+-----------------+    +---------+    +-----------------------+
+ * | commands | padding | jump to trailer | -> | trailer | -> | postamble (normal CS) |
+ * +----------+---------+-----------------+    +---------+    +-----------------------+
+ *
+ * When DGC uses a preamble (used to optimize large, mostly empty indirect sequence counts by
+ * removing a ton of padding), the trailer is still used but the layout looks like:
+ *
+ * +---------+---------+-----------------+    +----------+---------+-----------------+
+ * | trailer | padding | INDIRECT_BUFFER | -> | commands | padding | jump to trailer |
+ * +---------+---------+-----------------+    +----------+---------+-----------------+
  *
  * When DGC uses task shaders, the command buffer is split in two parts (GFX/COMPUTE), the
  * default layout looks like:
@@ -43,8 +53,8 @@
  *
  * The execution of this DGC command buffer is different if it's GFX or COMPUTE queue:
  * - on GFX, the driver uses the IB2 packet which is the easiest solution
- * - on COMPUTE, IB2 isn't supported and the driver submits the DGC command buffer separately
- *   without chaining
+ * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command
+ *   buffer by patching the trailer
  */
 
 static void
 radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout,
@@ -259,6 +269,12 @@ radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type
    return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
 }
 
+static unsigned
+radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
+{
+   return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
+}
+
 static bool
 radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info)
 {
@@ -271,11 +287,14 @@ struct dgc_cmdbuf_layout {
    bool use_preamble;
    uint32_t alloc_size;
 
+   uint32_t main_trailer_offset;
+   uint32_t main_preamble_offset;
    uint32_t main_offset;
    uint32_t main_cmd_stride;
    uint32_t main_preamble_size;
    uint32_t main_size;
 
+   uint32_t ace_trailer_offset;
    uint32_t ace_preamble_offset;
    uint32_t ace_main_offset;
    uint32_t ace_cmd_stride;
@@ -305,11 +324,19 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
       layout->ace_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
    }
 
-   layout->main_size = radv_pad_cmdbuf(device, layout->main_cmd_stride * sequences_count, AMD_IP_GFX);
-   layout->ace_size = radv_pad_cmdbuf(device, layout->ace_cmd_stride * sequences_count, AMD_IP_COMPUTE);
+   layout->main_size =
+      radv_pad_cmdbuf(device, (layout->main_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX);
+   layout->ace_size =
+      radv_pad_cmdbuf(device, (layout->ace_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE);
    layout->upload_size = layout->upload_stride * sequences_count;
 
    /* Main */
+   layout->main_trailer_offset = 0;
+
+   offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
+   offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
+   layout->main_preamble_offset = offset;
+
    if (layout->use_preamble)
       offset += layout->main_preamble_size;
    offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
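The main-IB half of this offset walk is above and the ACE half lands in the next hunk; linearized, the whole computation is easier to see. A standalone sketch under assumed alignment rules (align_pow2() stands in for radv_align_cmdbuf(), which really applies per-IP padding rules, and the sizes stand in for the radv_dgc_*_cmdbuf_size() helpers):

#include <stdbool.h>
#include <stdint.h>

static uint32_t align_pow2(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

struct layout_sketch {
   uint32_t trailer_offset;  /* always 0: fixed, preamble-independent */
   uint32_t preamble_offset; /* right after the (aligned) trailer */
   uint32_t main_offset;     /* generated commands, after the optional preamble */
};

/* Illustrative model of get_dgc_cmdbuf_layout() for the main IB only. */
static struct layout_sketch compute_layout(uint32_t trailer_size, uint32_t preamble_size,
                                           bool use_preamble, uint32_t ib_align)
{
   struct layout_sketch l = {0};
   uint32_t offset = 0;

   l.trailer_offset = 0;
   offset = align_pow2(offset + trailer_size, ib_align);
   l.preamble_offset = offset;
   if (use_preamble)
      offset += preamble_size;
   l.main_offset = align_pow2(offset, ib_align);
   return l;
}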
@@ -321,6 +348,10 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
 
    if (layout->ace_cmd_stride) {
       offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
+      layout->ace_trailer_offset = offset;
+
+      offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
+      offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
       layout->ace_preamble_offset = offset;
 
       if (layout->use_preamble)
@@ -356,14 +387,8 @@ radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum am
    return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_size : cmdbuf_layout.ace_size;
 }
 
-uint32_t
-radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
-{
-   return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX);
-}
-
-uint32_t
-radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+static uint32_t
+radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
 {
    VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
    VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
@@ -374,7 +399,48 @@ radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
 
    get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
 
-   return cmdbuf_layout.ace_preamble_offset;
+   return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset;
+}
+
+static uint32_t
+radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
+{
+   VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
+   VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
+   const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
+   const bool use_preamble = radv_dgc_use_preamble(cmd_info);
+   const uint32_t sequences_count = cmd_info->sequencesCount;
+   struct dgc_cmdbuf_layout cmdbuf_layout;
+
+   get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
+
+   const uint32_t offset =
+      ip_type == AMD_IP_GFX ? cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset;
+
+   return offset + radv_dgc_trailer_cmdbuf_size(device, ip_type) - PKT3_INDIRECT_BUFFER_BYTES;
+}
+
+uint32_t
+radv_get_indirect_gfx_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_GFX);
+}
+
+uint32_t
+radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX);
+}
+
+uint32_t
+radv_get_indirect_gfx_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_GFX);
+}
+
+uint32_t
+radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_COMPUTE);
 }
 
 uint32_t
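Note that radv_get_indirect_trailer_offset() returns the offset of the *patchable* INDIRECT_BUFFER packet, which occupies the last PKT3_INDIRECT_BUFFER_BYTES of the trailer block, not the block's start. A worked example with assumed sizes (a 4-dword, 16-byte packet and a trailer block padded to 256 bytes; the real values come from radv_dgc_trailer_cmdbuf_size()):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint32_t pkt3_indirect_buffer_bytes = 16; /* 4 dwords (assumed) */
   const uint32_t trailer_block_size = 256;        /* per-IP padded size (assumed) */
   const uint32_t trailer_block_offset = 0;        /* the trailer is always first */

   /* Offset of the chain packet the CPU patches at execute time. */
   uint32_t patch_offset = trailer_block_offset + trailer_block_size - pkt3_indirect_buffer_bytes;
   assert(patch_offset == 240);
   return 0;
}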
@@ -383,10 +449,18 @@ radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
    return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_COMPUTE);
 }
 
+uint32_t
+radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_COMPUTE);
+}
+
 struct radv_dgc_params {
+   uint32_t cmd_buf_preamble_offset;
    uint32_t cmd_buf_main_offset;
    uint32_t cmd_buf_stride;
    uint32_t cmd_buf_size;
+   uint32_t ace_cmd_buf_trailer_offset;
    uint32_t ace_cmd_buf_preamble_offset;
    uint32_t ace_cmd_buf_main_offset;
    uint32_t ace_cmd_buf_stride;
@@ -900,7 +974,7 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str
    const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
-   nir_def *size = nir_imul(b, cmd_buf_stride, sequence_count);
+   nir_def *size = nir_iadd_imm(b, nir_imul(b, cmd_buf_stride, sequence_count), PKT3_INDIRECT_BUFFER_BYTES);
    unsigned align_mask = radv_pad_cmdbuf(device, 1, ip_type) - 1;
 
    size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
@@ -913,7 +987,8 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str
 
 static void
 build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride,
-                      nir_def *sequence_count, const struct radv_device *device)
+                      nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size,
+                      const struct radv_device *device)
 {
    const struct radv_physical_device *pdev = radv_device_physical(device);
@@ -926,6 +1001,9 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
       nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
       nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
 
+      /* Add NOP padding but leave space for the INDIRECT_BUFFER packet. */
+      cmd_buf_size = nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES);
+
       nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
 
       nir_push_loop(b);
       {
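The next hunk stores the "jump to trailer" packet into the slot reserved here. A CPU-side model of the whole tail, reusing pkt3_header() and build_chain_packet() from the first sketch (the PKT3_NOP opcode 0x10 and the 0x3fff one-dword-NOP count are assumptions; the real shader emits variable-size NOP packets in a loop):

/* Model of build_dgc_buffer_tail(): pad from the end of the generated
 * commands up to (size - 4 dwords) with one-dword NOPs, then place the
 * chain packet targeting the trailer in the reserved slot. */
static void fill_tail(uint32_t *ib, uint32_t tail_start_dw, uint32_t size_dw,
                      uint64_t trailer_va, uint32_t trailer_size_dw)
{
   const uint32_t nop_pad = pkt3_header(0x10 /* PKT3_NOP */, 0x3fff);
   const uint32_t chain_dw = size_dw - 4; /* slot reserved by dgc_cmd_buf_size() */

   for (uint32_t cur = tail_start_dw; cur < chain_dw; cur++)
      ib[cur] = nop_pad;

   build_chain_packet(&ib[chain_dw], trailer_va, trailer_size_dw);
}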
@@ -949,6 +1027,17 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
          nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
       }
       nir_pop_loop(b, NULL);
+
+      nir_def *chain_packet[] = {
+         nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
+         nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
+         nir_imm_int(b, pdev->info.address32_hi),
+         nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
+      };
+
+      nir_build_store_global(b, nir_vec(b, chain_packet, 4),
+                             nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
+                             .access = ACCESS_NON_READABLE);
    }
    nir_pop_if(b, NULL);
 }
@@ -959,8 +1048,11 @@ build_dgc_buffer_tail_gfx(nir_builder *b, nir_def *sequence_count, const struct
    nir_def *cmd_buf_offset = load_param32(b, cmd_buf_main_offset);
    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
    nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
+   nir_def *cmd_buf_trailer_offset = nir_imm_int(b, 0);
+   unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4;
 
-   build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, sequence_count, device);
+   build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
+                         trailer_size, device);
 }
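The hunk below adds build_dgc_buffer_trailer(), which writes the trailer's initial image: one large NOP covering everything except the last 16 bytes, then one-dword NOPs seeding the slot the CPU later patches. A model of the dword math, with the same assumed 256-byte trailer and the encoders from the first sketch:

/* Model of the trailer image emitted by the prepare shader. */
static void write_trailer(uint32_t *ib, uint32_t trailer_size_bytes)
{
   const uint32_t pad_bytes = trailer_size_bytes - 16; /* minus the chain packet */
   const uint32_t pad_dw = pad_bytes / 4;

   /* One NOP packet spanning the padding: a header plus (pad_dw - 1) payload
    * dwords, so the encoded count is pad_dw - 2. */
   ib[0] = pkt3_header(0x10 /* PKT3_NOP */, pad_dw - 2);

   /* Placeholder for the chain packet; patched by the CPU at execute time. */
   for (uint32_t i = 0; i < 4; i++)
      ib[pad_dw + i] = pkt3_header(0x10, 0x3fff); /* 1-dword NOP pad (assumed) */
}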
@@ -969,8 +1061,63 @@ build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct
    nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_main_offset);
    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
    nir_def *cmd_buf_stride = load_param32(b, ace_cmd_buf_stride);
+   nir_def *cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
+   unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4;
 
-   build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, sequence_count, device);
+   build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
+                         trailer_size, device);
+}
+
+static void
+build_dgc_buffer_trailer(nir_builder *b, nir_def *cmd_buf_offset, unsigned trailer_size,
+                         const struct radv_device *device)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+
+   nir_def *global_id = get_global_ids(b, 1);
+
+   nir_push_if(b, nir_ieq_imm(b, global_id, 0));
+   {
+      nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
+      va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_offset));
+
+      const uint32_t pad_size = trailer_size - PKT3_INDIRECT_BUFFER_BYTES;
+      const uint32_t pad_size_dw = pad_size >> 2;
+
+      nir_def *len = nir_imm_int(b, pad_size_dw - 2);
+      nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
+
+      nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
+
+      nir_def *nop_packets[] = {
+         nir_imm_int(b, PKT3_NOP_PAD),
+         nir_imm_int(b, PKT3_NOP_PAD),
+         nir_imm_int(b, PKT3_NOP_PAD),
+         nir_imm_int(b, PKT3_NOP_PAD),
+      };
+
+      nir_build_store_global(b, nir_vec(b, nop_packets, 4), nir_iadd_imm(b, va, pad_size),
+                             .access = ACCESS_NON_READABLE);
+   }
+   nir_pop_if(b, NULL);
+}
+
+static void
+build_dgc_buffer_trailer_gfx(nir_builder *b, const struct radv_device *device)
+{
+   nir_def *cmd_buf_offset = nir_imm_int(b, 0);
+   const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
+
+   build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
+}
+
+static void
+build_dgc_buffer_trailer_ace(nir_builder *b, const struct radv_device *device)
+{
+   nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_trailer_offset);
+   const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
+
+   build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
 }
 
 static void
@@ -1014,7 +1161,7 @@ build_dgc_buffer_preamble(nir_builder *b, nir_def *cmd_buf_preamble_offset, nir_
 static void
 build_dgc_buffer_preamble_gfx(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
 {
-   nir_def *cmd_buf_preamble_offset = nir_imm_int(b, 0);
+   nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
    nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
    unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
@@ -1948,6 +2095,8 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
 
    sequence_count = nir_load_var(&b, count_var);
 
+   build_dgc_buffer_trailer_gfx(&b, dev);
+
    nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
    {
       struct dgc_cmdbuf cmd_buf = {
@@ -2029,6 +2178,8 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
       nir_def *ace_cmd_buf_stride = load_param32(&b, ace_cmd_buf_stride);
       nir_def *ace_cmd_buf_base_offset = load_param32(&b, ace_cmd_buf_main_offset);
 
+      build_dgc_buffer_trailer_ace(&b, dev);
+
       nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
       {
          struct dgc_cmdbuf cmd_buf = {
@@ -2511,9 +2662,11 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
    assert((cmdbuf_layout.ace_main_offset + upload_addr) % pdev->info.ip[AMD_IP_COMPUTE].ib_alignment == 0);
 
    struct radv_dgc_params params = {
+      .cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset,
       .cmd_buf_main_offset = cmdbuf_layout.main_offset,
       .cmd_buf_stride = cmdbuf_layout.main_cmd_stride,
       .cmd_buf_size = cmdbuf_layout.main_size,
+      .ace_cmd_buf_trailer_offset = cmdbuf_layout.ace_trailer_offset,
       .ace_cmd_buf_preamble_offset = cmdbuf_layout.ace_preamble_offset,
       .ace_cmd_buf_main_offset = cmdbuf_layout.ace_main_offset,
       .ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride,
diff --git a/src/amd/vulkan/radv_device_generated_commands.h b/src/amd/vulkan/radv_device_generated_commands.h
index 3af0eb15fe4..444d7c98c8c 100644
--- a/src/amd/vulkan/radv_device_generated_commands.h
+++ b/src/amd/vulkan/radv_device_generated_commands.h
@@ -61,8 +61,14 @@ uint32_t radv_get_indirect_gfx_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_
 
 uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
 
+uint32_t radv_get_indirect_gfx_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
+
 uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
 
+uint32_t radv_get_indirect_gfx_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
+
+uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
+
 bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
                               const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
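With the getters above in place, an execute path can derive both the DGC IB VA and the patchable trailer VA from the preprocess buffer. A condensed usage sketch of the GFX half (mirroring the radv_cmd_buffer.c hunk at the top; 'prep_va' is a stand-in for the preprocess buffer's base VA):

/* Hypothetical caller inside the driver tree; not the real radv_dgc_execute_ib(). */
static void execute_dgc_gfx(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
                            const VkGeneratedCommandsInfoNV *info, uint64_t prep_va,
                            bool predicating)
{
   const uint64_t ib_va = prep_va + info->preprocessOffset;
   const uint64_t gfx_ib_va = ib_va + radv_get_indirect_gfx_cmdbuf_offset(info);
   const uint64_t gfx_trailer_va = ib_va + radv_get_indirect_gfx_trailer_offset(info);
   const uint32_t cdw = radv_get_indirect_gfx_cmdbuf_size(info) >> 2;

   ws->cs_chain_dgc_ib(cs, gfx_ib_va, cdw, gfx_trailer_va, predicating);
}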
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 3776f2676a3..1a1a3eaeeee 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -747,8 +747,10 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
       .INTEL_shader_integer_functions2 = true,
       .MESA_image_alignment_control = pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level <= GFX11_5,
       .NV_compute_shader_derivatives = true,
-      .NV_device_generated_commands = pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc,
-      .NV_device_generated_commands_compute = pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc,
+      .NV_device_generated_commands =
+         pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
+      .NV_device_generated_commands_compute =
+         pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
       /* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from
        * using it.
        */
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index 336b0c7d216..d61b0c0f2f2 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -292,6 +292,9 @@ struct radeon_winsys {
    void (*cs_execute_ib)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo, const uint64_t va, const uint32_t cdw,
                          const bool predicate);
 
+   void (*cs_chain_dgc_ib)(struct radeon_cmdbuf *cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
+                           const bool predicate);
+
    void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count,
                    enum radv_cs_dump_type type);
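On compute, the winsys implementation below does three things: patch the trailer of the DGC IB via WRITE_DATA (a CP-side memory write, so it is ordered with command processing), chain the current CS into the DGC IB, and open a fresh CS for the trailer to chain back into. A sketch of just the WRITE_DATA payload, reusing the illustrative encoders from the first sketch (the DST_SEL/WR_CONFIRM/ENGINE_SEL bit positions are assumptions, not Mesa's real S_370_* macros):

/* Model of the CP-side patch: WRITE_DATA overwrites the trailer's 4-dword
 * NOP placeholder with a chain packet. 'out' receives 8 dwords. */
static void emit_trailer_patch(uint32_t *out, uint64_t trailer_va,
                               const uint32_t chain_data[4])
{
   out[0] = pkt3_header(0x37 /* PKT3_WRITE_DATA */, 2 + 4);
   out[1] = (5u << 8) /* DST_SEL=MEM */ | (1u << 20) /* WR_CONFIRM */; /* bits assumed */
   out[2] = (uint32_t)trailer_va;
   out[3] = (uint32_t)(trailer_va >> 32);
   for (int i = 0; i < 4; i++)
      out[4 + i] = chain_data[i];
}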
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index e0880d18414..e3e4708ee31 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -831,6 +831,92 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
    }
 }
 
+static void
+radv_amdgpu_cs_chain_dgc_ib(struct radeon_cmdbuf *_cs, uint64_t va, uint32_t cdw, uint64_t trailer_va,
+                            const bool predicate)
+{
+   struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
+
+   if (cs->status != VK_SUCCESS)
+      return;
+
+   assert(cs->ws->info.gfx_level >= GFX8);
+
+   if (cs->hw_ip == AMD_IP_GFX) {
+      /* Use IB2 for executing the DGC CS on GFX. */
+      cs->ws->base.cs_execute_ib(_cs, NULL, va, cdw, predicate);
+   } else {
+      assert(va && va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
+      assert(cdw <= ~C_3F2_IB_SIZE);
+
+      /* Emit a WRITE_DATA packet to patch the DGC CS. */
+      const uint32_t chain_data[] = {
+         PKT3(PKT3_INDIRECT_BUFFER, 2, 0),
+         0,
+         0,
+         S_3F2_CHAIN(1) | S_3F2_VALID(1),
+      };
+
+      radeon_emit(&cs->base, PKT3(PKT3_WRITE_DATA, 2 + ARRAY_SIZE(chain_data), false));
+      radeon_emit(&cs->base, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
+      radeon_emit(&cs->base, trailer_va);
+      radeon_emit(&cs->base, trailer_va >> 32);
+      radeon_emit_array(&cs->base, chain_data, ARRAY_SIZE(chain_data));
+
+      /* Keep pointers for patching later. */
+      uint64_t *ib_va_ptr = (uint64_t *)(cs->base.buf + cs->base.cdw - 3);
+      uint32_t *ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
+
+      /* Writeback L2 because CP isn't coherent with L2 on GFX6-8. */
+      if (cs->ws->info.gfx_level == GFX8) {
+         radeon_emit(&cs->base, PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(1));
+         radeon_emit(&cs->base, S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
+         radeon_emit(&cs->base, 0xffffffff);
+         radeon_emit(&cs->base, 0xff);
+         radeon_emit(&cs->base, 0);
+         radeon_emit(&cs->base, 0);
+         radeon_emit(&cs->base, 0x0000000A);
+      }
+
+      /* Finalize the current CS. */
+      cs->ws->base.cs_finalize(_cs);
+
+      /* Chain the current CS to the DGC CS. */
+      _cs->buf[_cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
+      _cs->buf[_cs->cdw - 3] = va;
+      _cs->buf[_cs->cdw - 2] = va >> 32;
+      _cs->buf[_cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | cdw;
+
+      /* Allocate a new CS BO with the initial size. */
+      const uint64_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
+
+      VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
+      if (result != VK_SUCCESS) {
+         cs->base.cdw = 0;
+         cs->status = result;
+         return;
+      }
+
+      cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
+      if (!cs->ib_mapped) {
+         cs->base.cdw = 0;
+         cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+         return;
+      }
+
+      cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
+
+      /* Chain the trailer (in the DGC CS) back to the newly created CS. */
+      *ib_va_ptr = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
+      cs->ib_size_ptr = ib_size_ptr;
+
+      cs->base.buf = (uint32_t *)cs->ib_mapped;
+      cs->base.cdw = 0;
+      cs->base.reserved_dw = 0;
+      cs->base.max_dw = ib_size / 4 - 4;
+   }
+}
+
 static unsigned
 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
 {
@@ -1934,6 +2020,7 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
    ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
    ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
    ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
+   ws->base.cs_chain_dgc_ib = radv_amdgpu_cs_chain_dgc_ib;
    ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
    ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
    ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
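One subtlety worth a model: the WRITE_DATA payload above ships with a zero VA and size, and the function keeps raw pointers into the CS (ib_va_ptr/ib_size_ptr) so the *next* IB's address and size can be filled in once known (the size only settles when that CS is itself finalized). A minimal sketch of this deferred-patch pattern, under the assumption that CHAIN|VALID are already set in the size dword:

#include <stdint.h>

/* Illustrative mirror of ib_va_ptr/ib_size_ptr in the hunk above. */
struct chain_patch {
   uint64_t *ib_va_ptr;   /* the packet's two VA dwords, viewed as one 64-bit field */
   uint32_t *ib_size_ptr; /* the size/flags dword; CHAIN|VALID already set */
};

static void resolve_chain(struct chain_patch p, uint64_t next_ib_va, uint32_t next_ib_cdw)
{
   *p.ib_va_ptr = next_ib_va;     /* patches data dwords 1-2 of the chain packet */
   *p.ib_size_ptr |= next_ib_cdw; /* IB size lands in the low bits */
}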