radv,aco,ac/llvm: fix indirect dispatches on the compute queue on GFX7-10
Since neither PKT3_LOAD_SH_REG_INDEX nor PKT3_COPY_DATA works with compute
queues on GFX7-10, we have to load the dispatch size from memory in the
shader.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15064>
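In short, the fix moves the read of the indirect dispatch size from the command processor into the shader. A minimal sketch of the two paths, keyed off the new device flag (illustrative only; the real logic is spread across the diffs below):

#include <stdbool.h>

/* Sketch of the two grid-size ABIs this commit introduces. */
static void sketch_grid_size_abi(bool load_grid_size_from_user_sgpr)
{
   if (load_grid_size_from_user_sgpr) {
      /* GFX10.3+: the command processor copies the three dispatch-size
       * dwords straight into user SGPRs (PKT3_LOAD_SH_REG_INDEX), and
       * the shader reads num_work_groups directly from its arguments. */
   } else {
      /* GFX7-10, including their compute queues: only the 64-bit address
       * of the dispatch size is placed in user SGPRs; the shader fetches
       * the three dwords itself with scalar memory (SMEM) loads. */
   }
}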
@@ -8143,7 +8143,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
    case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
    case nir_intrinsic_load_num_workgroups: {
       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
+      if (ctx->options->load_grid_size_from_user_sgpr) {
+         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.num_work_groups));
+      } else {
+         Temp addr = get_arg(ctx, ctx->args->ac.num_work_groups);
+         assert(addr.regClass() == s2);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
+                    bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
+      }
       emit_split_vector(ctx, dst, 3);
       break;
    }
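The 0 and 8 offsets fall out of SMEM's load widths: scalar loads come in 1, 2, 4, 8 and 16 dword variants with no three-dword form, so x and y are fetched as a pair and z separately, and p_create_vector glues the two results into the v3 destination. A sketch of the buffer layout being read (it matches VkDispatchIndirectCommand):

#include <stdint.h>

/* What the two SMEM loads above fetch: three consecutive dwords. */
struct dispatch_size {
   uint32_t x; /* offset 0: covered by s_load_dwordx2, together with y */
   uint32_t y; /* offset 4 */
   uint32_t z; /* offset 8: covered by the separate s_load_dword */
};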
@@ -3721,7 +3721,14 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       result = ctx->instance_id_replaced ? ctx->instance_id_replaced : ctx->abi->instance_id;
       break;
    case nir_intrinsic_load_num_workgroups:
-      result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
+      if (ctx->abi->load_grid_size_from_user_sgpr) {
+         result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
+      } else {
+         LLVMTypeRef ptr_type = ac_array_in_const_addr_space(ctx->ac.v3i32);
+         LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
+         ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, "");
+         result = ac_build_load_invariant(&ctx->ac, ptr, ctx->ac.i32_0);
+      }
       break;
    case nir_intrinsic_load_local_invocation_index:
       result = visit_load_local_invocation_index(ctx);
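For orientation: ac_array_in_const_addr_space is assumed to build a pointer type in the AMDGPU constant address space, and ac_build_load_invariant to emit a load tagged invariant so the backend may hoist it and lower it to scalar loads. A hypothetical simplification, not the exact Mesa source:

#include <llvm-c/Core.h>

/* Hypothetical sketch of ac_array_in_const_addr_space(); the AMDGPU
 * backend treats address space 4 as constant memory. */
static LLVMTypeRef sketch_const_ptr_type(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, 4 /* AC_ADDR_SPACE_CONST */);
}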
@@ -175,6 +175,9 @@ struct ac_shader_abi {
     * images.
     */
    bool disable_aniso_single_level;
+
+   /* Whether to inline the compute dispatch size in user sgprs. */
+   bool load_grid_size_from_user_sgpr;
 };

 #endif /* AC_SHADER_ABI_H */
@@ -1906,6 +1906,9 @@ radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer,
                          cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout,
                          VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);

+   cmd_buffer->state.flush_bits |=
+      radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL);
+
    radv_indirect_dispatch(cmd_buffer, src->bo,
                           src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size));
    radv_meta_restore(&saved_state, cmd_buffer);
@@ -2052,6 +2055,9 @@ radv_CmdCopyAccelerationStructureToMemoryKHR(
                          cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout,
                          VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);

+   cmd_buffer->state.flush_bits |=
+      radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, NULL);
+
    radv_indirect_dispatch(cmd_buffer, src->bo,
                           src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size));
    radv_meta_restore(&saved_state, cmd_buffer);
@@ -3998,6 +3998,10 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2KHR dst_
 {
    switch ((VkAccessFlags2KHR)(1 << b)) {
    case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR:
+      /* SMEM loads are used to read compute dispatch size in shaders */
+      if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
+         flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
+      break;
    case VK_ACCESS_2_INDEX_READ_BIT_KHR:
    case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
       break;
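The reasoning this hunk implies: with the pointer ABI the dispatch size is read by the shader through the scalar cache rather than by the command processor straight from memory, so writes to the indirect buffer have to be made visible to SMEM first. In sketch form, with the assumption spelled out:

/* Assumed coherency model: the scalar (constant) cache is not coherent
 * with earlier shader or transfer writes to the indirect buffer, so it
 * must be invalidated before the shader's SMEM loads. With the
 * user-SGPR ABI the CP reads memory directly and no shader cache
 * invalidation is needed for this access. */
if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
   flush_bits |= RADV_CMD_FLAG_INV_SCACHE;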
@@ -7263,24 +7267,17 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
    radv_cs_add_buffer(ws, cs, info->indirect);

    if (loc->sgpr_idx != -1) {
-      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3) {
-         unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
+      unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
+
+      if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
+         assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3);
          radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
          radeon_emit(cs, info->va);
          radeon_emit(cs, info->va >> 32);
          radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
          radeon_emit(cs, 3);
       } else {
-         for (unsigned i = 0; i < 3; ++i) {
-            radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-            radeon_emit(cs,
-                        COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
-            radeon_emit(cs, (info->va + 4 * i));
-            radeon_emit(cs, (info->va + 4 * i) >> 32);
-            radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
-            radeon_emit(cs, 0);
-         }
+         radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
       }
    }
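The else branch swaps three COPY_DATA packets (one per dword, read from memory by the CP, which is what hangs on GFX7-10 compute queues) for a plain shader-pointer write. A sketch of what radv_emit_shader_pointer is assumed to emit for a 64-bit pointer (illustrative; the real helper also handles 32-bit pointer mode):

/* Hypothetical expansion of radv_emit_shader_pointer(device, cs, reg,
 * va, true) for 64-bit user pointers. */
radeon_set_sh_reg_seq(cs, reg, 2); /* two consecutive user SGPRs */
radeon_emit(cs, va);               /* low 32 bits of the address  */
radeon_emit(cs, va >> 32);         /* high 32 bits                */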
@@ -7335,12 +7332,22 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
    }

    if (loc->sgpr_idx != -1) {
-      assert(loc->num_sgprs == 3);
-      radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
-      radeon_emit(cs, blocks[0]);
-      radeon_emit(cs, blocks[1]);
-      radeon_emit(cs, blocks[2]);
+      if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
+         assert(loc->num_sgprs == 3);
+
+         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
+         radeon_emit(cs, blocks[0]);
+         radeon_emit(cs, blocks[1]);
+         radeon_emit(cs, blocks[2]);
+      } else {
+         uint32_t offset;
+         if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
+            return;
+
+         uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
+         radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
+                                  R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
+      }
    }

    if (offsets[0] || offsets[1] || offsets[2]) {
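Note that with the pointer ABI even direct dispatches go through memory: the CPU-known group counts are written into the command buffer's upload BO and their VA is emitted instead of the values themselves. The 12 passed to radv_cmd_buffer_upload_data is simply the size of the three grid-size dwords:

#include <stdint.h>
#include <assert.h>

/* A sketch of the magic number above. */
static_assert(sizeof(uint32_t[3]) == 12, "three grid-size dwords");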
@@ -3411,6 +3411,9 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
                                           device->physical_device->rad_info.family == CHIP_NAVY_FLOUNDER ||
                                           device->physical_device->rad_info.family == CHIP_VANGOGH);

+   /* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */
+   device->load_grid_size_from_user_sgpr = device->physical_device->rad_info.chip_class >= GFX10_3;
+
    device->keep_shader_info = keep_shader_info;
    result = radv_device_init_meta(device);
    if (result != VK_SUCCESS)
@@ -2311,6 +2311,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
    ctx.abi.adjust_frag_coord_z = options->adjust_frag_coord_z;
    ctx.abi.robust_buffer_access = options->robust_buffer_access;
    ctx.abi.disable_aniso_single_level = options->disable_aniso_single_level;
+   ctx.abi.load_grid_size_from_user_sgpr = options->load_grid_size_from_user_sgpr;

    bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg;
    if (shader_count >= 2 || is_ngg)
@@ -835,6 +835,9 @@ struct radv_device {
     */
    bool adjust_frag_coord_z;

+   /* Whether to inline the compute dispatch size in user sgprs. */
+   bool load_grid_size_from_user_sgpr;
+
    /* Whether the driver uses a global BO list. */
    bool use_global_bo_list;
@@ -1934,6 +1934,7 @@ shader_compile(struct radv_device *device, struct vk_shader_module *module,
       module && !is_meta_shader(module->nir) && options->key.ps.enable_mrt_output_nan_fixup;
    options->adjust_frag_coord_z = options->key.adjust_frag_coord_z;
    options->disable_aniso_single_level = options->key.disable_aniso_single_level;
+   options->load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr;
    options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
    options->debug.func = radv_compiler_debug;
    options->debug.private_data = &debug_data;
@@ -127,6 +127,7 @@ struct radv_nir_compiler_options {
    bool wgp_mode;
    bool remap_spi_ps_input;
    bool disable_aniso_single_level;
+   bool load_grid_size_from_user_sgpr;
    enum radeon_family family;
    enum chip_class chip_class;
    const struct radeon_info *info;
@@ -211,7 +211,7 @@ allocate_user_sgprs(const struct radv_nir_compiler_options *options,
       if (info->cs.uses_sbt)
          user_sgpr_count += 1;
       if (info->cs.uses_grid_size)
-         user_sgpr_count += 3;
+         user_sgpr_count += options->load_grid_size_from_user_sgpr ? 3 : 2;
       if (info->cs.uses_ray_launch_size)
          user_sgpr_count += 3;
       break;
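The accounting mirrors the argument declaration below: the inline ABI costs three user SGPRs for x/y/z, while the pointer ABI costs two for a single 64-bit address that the shader then dereferences. As a trivial sketch:

#include <stdbool.h>

/* A sketch of the SGPR accounting above: three dwords inline vs. one
 * 64-bit pointer (two dwords). */
static unsigned grid_size_user_sgprs(bool load_grid_size_from_user_sgpr)
{
   return load_grid_size_from_user_sgpr ? 3 : 2;
}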
@@ -594,7 +594,10 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options,
    }

    if (info->cs.uses_grid_size) {
-      ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
+      if (options->load_grid_size_from_user_sgpr)
+         ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
+      else
+         ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.num_work_groups);
    }

    if (info->cs.uses_ray_launch_size) {
@@ -819,7 +822,8 @@ radv_declare_shader_args(const struct radv_nir_compiler_options *options,
       set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx);
    }
    if (args->ac.num_work_groups.used) {
-      set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3);
+      set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx,
+                     options->load_grid_size_from_user_sgpr ? 3 : 2);
    }
    if (args->ac.ray_launch_size.used) {
       set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3);
@@ -521,6 +521,7 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *
    ctx->abi.convert_undef_to_zero = true;
    ctx->abi.adjust_frag_coord_z = false;
    ctx->abi.disable_aniso_single_level = true;
+   ctx->abi.load_grid_size_from_user_sgpr = true;

    const struct si_shader_info *info = &ctx->shader->selector->info;
    for (unsigned i = 0; i < info->num_outputs; i++) {