radv,aco,ac/llvm: fix indirect dispatches on the compute queue on GFX7-10

Since neither PKT3_LOAD_SH_REG_INDEX nor PKT3_COPY_DATA works with compute
queues on GFX7-10, we have to load the dispatch size from memory in the
shader.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15064>
This commit is contained in:
Rhys Perry
2022-02-16 20:01:36 +00:00
committed by Marge Bot
parent 973967c49d
commit c4cf92cad7
12 changed files with 66 additions and 21 deletions

View File

@@ -8143,7 +8143,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
case nir_intrinsic_load_num_workgroups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
if (ctx->options->load_grid_size_from_user_sgpr) {
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.num_work_groups));
} else {
Temp addr = get_arg(ctx, ctx->args->ac.num_work_groups);
assert(addr.regClass() == s2);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
}
emit_split_vector(ctx, dst, 3);
break;
}

View File

@@ -3721,7 +3721,14 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
result = ctx->instance_id_replaced ? ctx->instance_id_replaced : ctx->abi->instance_id;
break;
case nir_intrinsic_load_num_workgroups:
result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
if (ctx->abi->load_grid_size_from_user_sgpr) {
result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
} else {
LLVMTypeRef ptr_type = ac_array_in_const_addr_space(ctx->ac.v3i32);
LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, "");
result = ac_build_load_invariant(&ctx->ac, ptr, ctx->ac.i32_0);
}
break;
case nir_intrinsic_load_local_invocation_index:
result = visit_load_local_invocation_index(ctx);

View File

@@ -175,6 +175,9 @@ struct ac_shader_abi {
* images.
*/
bool disable_aniso_single_level;
/* Whether to inline the compute dispatch size in user sgprs. */
bool load_grid_size_from_user_sgpr;
};
#endif /* AC_SHADER_ABI_H */

View File

@@ -1906,6 +1906,9 @@ radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer,
cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
cmd_buffer->state.flush_bits |=
radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL);
radv_indirect_dispatch(cmd_buffer, src->bo,
src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size));
radv_meta_restore(&saved_state, cmd_buffer);
@@ -2052,6 +2055,9 @@ radv_CmdCopyAccelerationStructureToMemoryKHR(
cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
cmd_buffer->state.flush_bits |=
radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL);
radv_indirect_dispatch(cmd_buffer, src->bo,
src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size));
radv_meta_restore(&saved_state, cmd_buffer);

View File

@@ -3998,6 +3998,10 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2KHR dst_
{
switch ((VkAccessFlags2KHR)(1 << b)) {
case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR:
/* SMEM loads are used to read compute dispatch size in shaders */
if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
break;
case VK_ACCESS_2_INDEX_READ_BIT_KHR:
case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
break;
@@ -7263,24 +7267,17 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
radv_cs_add_buffer(ws, cs, info->indirect);
if (loc->sgpr_idx != -1) {
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3) {
unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3);
radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
radeon_emit(cs, info->va);
radeon_emit(cs, info->va >> 32);
radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
radeon_emit(cs, 3);
} else {
for (unsigned i = 0; i < 3; ++i) {
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs,
COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
radeon_emit(cs, (info->va + 4 * i));
radeon_emit(cs, (info->va + 4 * i) >> 32);
radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
radeon_emit(cs, 0);
}
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
}
}
@@ -7335,12 +7332,22 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
}
if (loc->sgpr_idx != -1) {
assert(loc->num_sgprs == 3);
if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
assert(loc->num_sgprs == 3);
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
radeon_emit(cs, blocks[0]);
radeon_emit(cs, blocks[1]);
radeon_emit(cs, blocks[2]);
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
radeon_emit(cs, blocks[0]);
radeon_emit(cs, blocks[1]);
radeon_emit(cs, blocks[2]);
} else {
uint32_t offset;
if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
return;
uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
}
}
if (offsets[0] || offsets[1] || offsets[2]) {

View File

@@ -3411,6 +3411,9 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
device->physical_device->rad_info.family == CHIP_NAVY_FLOUNDER ||
device->physical_device->rad_info.family == CHIP_VANGOGH);
/* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */
device->load_grid_size_from_user_sgpr = device->physical_device->rad_info.chip_class >= GFX10_3;
device->keep_shader_info = keep_shader_info;
result = radv_device_init_meta(device);
if (result != VK_SUCCESS)

View File

@@ -2311,6 +2311,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
ctx.abi.adjust_frag_coord_z = options->adjust_frag_coord_z;
ctx.abi.robust_buffer_access = options->robust_buffer_access;
ctx.abi.disable_aniso_single_level = options->disable_aniso_single_level;
ctx.abi.load_grid_size_from_user_sgpr = options->load_grid_size_from_user_sgpr;
bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg;
if (shader_count >= 2 || is_ngg)

View File

@@ -835,6 +835,9 @@ struct radv_device {
*/
bool adjust_frag_coord_z;
/* Whether to inline the compute dispatch size in user sgprs. */
bool load_grid_size_from_user_sgpr;
/* Whether the driver uses a global BO list. */
bool use_global_bo_list;

View File

@@ -1934,6 +1934,7 @@ shader_compile(struct radv_device *device, struct vk_shader_module *module,
module && !is_meta_shader(module->nir) && options->key.ps.enable_mrt_output_nan_fixup;
options->adjust_frag_coord_z = options->key.adjust_frag_coord_z;
options->disable_aniso_single_level = options->key.disable_aniso_single_level;
options->load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr;
options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
options->debug.func = radv_compiler_debug;
options->debug.private_data = &debug_data;

View File

@@ -127,6 +127,7 @@ struct radv_nir_compiler_options {
bool wgp_mode;
bool remap_spi_ps_input;
bool disable_aniso_single_level;
bool load_grid_size_from_user_sgpr;
enum radeon_family family;
enum chip_class chip_class;
const struct radeon_info *info;

View File

@@ -211,7 +211,7 @@ allocate_user_sgprs(const struct radv_nir_compiler_options *options,
if (info->cs.uses_sbt)
user_sgpr_count += 1;
if (info->cs.uses_grid_size)
user_sgpr_count += 3;
user_sgpr_count += options->load_grid_size_from_user_sgpr ? 3 : 2;
if (info->cs.uses_ray_launch_size)
user_sgpr_count += 3;
break;
@@ -594,7 +594,10 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options,
}
if (info->cs.uses_grid_size) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
if (options->load_grid_size_from_user_sgpr)
ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
else
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.num_work_groups);
}
if (info->cs.uses_ray_launch_size) {
@@ -819,7 +822,8 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options,
set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx);
}
if (args->ac.num_work_groups.used) {
set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3);
set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx,
options->load_grid_size_from_user_sgpr ? 3 : 2);
}
if (args->ac.ray_launch_size.used) {
set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3);

View File

@@ -521,6 +521,7 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *
ctx->abi.convert_undef_to_zero = true;
ctx->abi.adjust_frag_coord_z = false;
ctx->abi.disable_aniso_single_level = true;
ctx->abi.load_grid_size_from_user_sgpr = true;
const struct si_shader_info *info = &ctx->shader->selector->info;
for (unsigned i = 0; i < info->num_outputs; i++) {