diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9264427aae9..f824ada5db2 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8341,11 +8341,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_load_workgroup_id: { Temp dst = get_ssa_temp(ctx, &instr->def); if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) { - const struct ac_arg* ids = ctx->args->workgroup_ids; - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(), - ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(), - ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero()); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), ctx->workgroup_id[0], + ctx->workgroup_id[1], ctx->workgroup_id[2]); emit_split_vector(ctx, dst, 3); } else { isel_err(&instr->instr, "Unsupported stage for load_workgroup_id"); @@ -11148,6 +11145,9 @@ add_startpgm(struct isel_context* ctx) def_count++; } + if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12) + def_count += 2; + Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count); ctx->block->instructions.emplace_back(startpgm); for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) { @@ -11180,6 +11180,32 @@ add_startpgm(struct isel_context* ctx) } } + if (ctx->program->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER) { + Temp idx = ctx->program->allocateTmp(s1); + Temp idy = ctx->program->allocateTmp(s1); + startpgm->definitions[def_count - 2] = Definition(idx); + startpgm->definitions[def_count - 2].setFixed(PhysReg(108 + 9 /*ttmp9*/)); + startpgm->definitions[def_count - 1] = Definition(idy); + startpgm->definitions[def_count - 1].setFixed(PhysReg(108 + 7 /*ttmp7*/)); + ctx->workgroup_id[0] = Operand(idx); + if (ctx->args->workgroup_ids[2].used) { + Builder bld(ctx->program, ctx->block); + ctx->workgroup_id[1] = + bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::zero(), + Operand::c32(16u), Operand::zero()); + ctx->workgroup_id[2] = + bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::c32(1u), + Operand::c32(16u), Operand::zero()); + } else { + ctx->workgroup_id[1] = Operand(idy); + ctx->workgroup_id[2] = Operand::zero(); + } + } else if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) { + const struct ac_arg* ids = ctx->args->workgroup_ids; + for (unsigned i = 0; i < 3; i++) + ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero(); + } + /* epilog has no scratch */ if (ctx->args->scratch_offset.used) { if (ctx->program->gfx_level < GFX9) { @@ -12289,10 +12315,18 @@ select_rt_prolog(Program* program, ac_shader_config* config, PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors); PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr); PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base); - PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]); - PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]); - PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]); + PhysReg in_wg_id_x; + PhysReg in_wg_id_y; + PhysReg in_wg_id_z; PhysReg in_scratch_offset; + if (options->gfx_level < GFX12) { + in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]); + in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]); + in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]); + } else { + in_wg_id_x = PhysReg(108 + 9 /*ttmp9*/); + in_wg_id_y = PhysReg(108 + 7 /*ttmp7*/); + } if (options->gfx_level < GFX11) in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset); PhysReg in_local_ids[2] = { @@ -12330,6 +12364,8 @@ select_rt_prolog(Program* program, ac_shader_config* config, num_sgprs += 2; PhysReg tmp_ring_offsets = PhysReg{num_sgprs}; num_sgprs += 2; + PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs}; + num_sgprs++; PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++}; @@ -12379,9 +12415,18 @@ select_rt_prolog(Program* program, ac_shader_config* config, Operand(in_local_ids[0], v1)); } /* Do this backwards to reduce some RAW hazards on GFX11+ */ - bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1)); - bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1), - Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1)); + if (options->gfx_level >= GFX12) { + bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16), + Operand(in_wg_id_y, s1)); + bld.vop3(aco_opcode::v_mad_u32_u16, Definition(out_launch_ids[1], v1), + Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8), + Operand(in_local_ids[1], v1)); + } else { + bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1)); + bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), + Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8), + Operand(in_local_ids[1], v1)); + } bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1), Operand::c32(8), Operand(in_local_ids[0], v1)); @@ -12407,14 +12452,14 @@ select_rt_prolog(Program* program, ac_shader_config* config, /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs. * Calculating the 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size). - * in_wg_id_x now holds wg_id.x * wg_size. + * tmp_wg_id_x_times_size now holds wg_id.x * wg_size. */ - bld.sop2(aco_opcode::s_lshl_b32, Definition(in_wg_id_x, s1), Definition(scc, s1), + bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_id_x_times_size, s1), Definition(scc, s1), Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6)); /* Calculate and add local_invocation_index */ bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u), - Operand(in_wg_id_x, s1)); + Operand(tmp_wg_id_x_times_size, s1)); if (program->wave_size == 64) { if (program->gfx_level <= GFX7) bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1), diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index d5750db0fcb..5ba7a555fe9 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -72,6 +72,7 @@ struct isel_context { nir_unsigned_upper_bound_config ub_config; Temp arg_temps[AC_MAX_ARGS]; + Operand workgroup_id[3]; /* tessellation information */ uint64_t tcs_temp_only_inputs;