diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 2d35d65bbec..c15fac1aee3 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -267,18 +267,13 @@ static nir_ssa_def * emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives, nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim) { - nir_ssa_def *arg = vertex_indices[0]; + nir_ssa_def *arg = b->shader->info.stage == MESA_SHADER_VERTEX + ? nir_build_load_initial_edgeflags_amd(b) + : nir_imm_int(b, 0); for (unsigned i = 0; i < num_vertices_per_primitives; ++i) { assert(vertex_indices[i]); - - if (i) - arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i))); - - if (b->shader->info.stage == MESA_SHADER_VERTEX) { - nir_ssa_def *edgeflag = nir_build_load_initial_edgeflag_amd(b, 32, nir_imm_int(b, i)); - arg = nir_ior(b, arg, nir_ishl(b, edgeflag, nir_imm_int(b, 10u * i + 9u))); - } + arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i))); } if (is_null_prim) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index bfed664b09b..57793000320 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8938,14 +8938,16 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) Operand::c32(pos | (9u << 16u))); break; } - case nir_intrinsic_load_initial_edgeflag_amd: { + case nir_intrinsic_load_initial_edgeflags_amd: { assert(ctx->stage.hw == HWStage::NGG); - assert(nir_src_is_const(instr->src[0])); - unsigned i = nir_src_as_uint(instr->src[0]); Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); - bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u)); + /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */ + Temp flags = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id); + /* Move the bits to their desired position: 8->9, 9->19, 10->29. */ + flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags); + /* Remove garbage bits that are a byproduct of the multiplication. */ + bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x20080200), flags); break; } case nir_intrinsic_load_packed_passthrough_primitive_amd: { diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index af3508ea62c..f86a5a5a331 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -766,7 +766,7 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_buffer_amd: case nir_intrinsic_load_tess_rel_patch_id_amd: case nir_intrinsic_load_gs_vertex_offset_amd: - case nir_intrinsic_load_initial_edgeflag_amd: + case nir_intrinsic_load_initial_edgeflags_amd: case nir_intrinsic_load_packed_passthrough_primitive_amd: case nir_intrinsic_gds_atomic_add_amd: case nir_intrinsic_load_sbt_amd: diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 7eb8d537084..9aecf8aee11 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -513,7 +513,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_has_input_vertex_amd: case nir_intrinsic_has_input_primitive_amd: case nir_intrinsic_load_packed_passthrough_primitive_amd: - case nir_intrinsic_load_initial_edgeflag_amd: + case nir_intrinsic_load_initial_edgeflags_amd: case nir_intrinsic_gds_atomic_add_amd: is_divergent = true; break; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 3d2a93236e3..13314a8005c 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1196,8 +1196,8 @@ intrinsic("load_cull_small_primitives_enabled_amd", dest_comp=1, bit_sizes=[1], intrinsic("load_cull_any_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE]) # Small primitive culling precision intrinsic("load_cull_small_prim_precision_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER]) -# Initial edge flag in a Vertex Shader. src = {vertex index}. -intrinsic("load_initial_edgeflag_amd", src_comp=[1], dest_comp=1, indices=[]) +# Initial edge flags in a Vertex Shader, packed into the format the HW needs for primitive export. +intrinsic("load_initial_edgeflags_amd", src_comp=[], dest_comp=1, bit_sizes=[32], indices=[]) # Exports the current invocation's vertex. This is a placeholder where all vertex attribute export instructions should be emitted. intrinsic("export_vertex_amd", src_comp=[], indices=[]) # Exports the current invocation's primitive. src[] = {packed_primitive_data}.