aco, nir, ac: Simplify sequence of getting initial NGG VS edge flags.
Instead of v_bfe + v_lshl_or for each vertex, get all 3 edge flags at once of every vertex. This takes fewer VALU instructions than previously. Fossil DB results on Sienna Cichlid (with NGGC on): Totals from 56917 (44.24% of 128647) affected shaders: CodeSize: 161028288 -> 158751628 (-1.41%) Instrs: 30917985 -> 30519571 (-1.29%) Latency: 130617204 -> 129975532 (-0.49%); split: -0.50%, +0.01% InvThroughput: 21280238 -> 20927401 (-1.66%) Copies: 3011120 -> 3011125 (+0.00%); split: -0.00%, +0.00% No Fossil DB changes with NGGC off. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11908>
This commit is contained in:
@@ -267,18 +267,13 @@ static nir_ssa_def *
|
|||||||
emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
|
emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
|
||||||
nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim)
|
nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim)
|
||||||
{
|
{
|
||||||
nir_ssa_def *arg = vertex_indices[0];
|
nir_ssa_def *arg = b->shader->info.stage == MESA_SHADER_VERTEX
|
||||||
|
? nir_build_load_initial_edgeflags_amd(b)
|
||||||
|
: nir_imm_int(b, 0);
|
||||||
|
|
||||||
for (unsigned i = 0; i < num_vertices_per_primitives; ++i) {
|
for (unsigned i = 0; i < num_vertices_per_primitives; ++i) {
|
||||||
assert(vertex_indices[i]);
|
assert(vertex_indices[i]);
|
||||||
|
arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i)));
|
||||||
if (i)
|
|
||||||
arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i)));
|
|
||||||
|
|
||||||
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
|
|
||||||
nir_ssa_def *edgeflag = nir_build_load_initial_edgeflag_amd(b, 32, nir_imm_int(b, i));
|
|
||||||
arg = nir_ior(b, arg, nir_ishl(b, edgeflag, nir_imm_int(b, 10u * i + 9u)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_null_prim) {
|
if (is_null_prim) {
|
||||||
|
@@ -8938,14 +8938,16 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||||||
Operand::c32(pos | (9u << 16u)));
|
Operand::c32(pos | (9u << 16u)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_load_initial_edgeflag_amd: {
|
case nir_intrinsic_load_initial_edgeflags_amd: {
|
||||||
assert(ctx->stage.hw == HWStage::NGG);
|
assert(ctx->stage.hw == HWStage::NGG);
|
||||||
assert(nir_src_is_const(instr->src[0]));
|
|
||||||
unsigned i = nir_src_as_uint(instr->src[0]);
|
|
||||||
|
|
||||||
Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
|
Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
|
||||||
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
|
/* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
|
||||||
gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u));
|
Temp flags = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
|
||||||
|
/* Move the bits to their desired position: 8->9, 9->19, 10->29. */
|
||||||
|
flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
|
||||||
|
/* Remove garbage bits that are a byproduct of the multiplication. */
|
||||||
|
bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x20080200), flags);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_load_packed_passthrough_primitive_amd: {
|
case nir_intrinsic_load_packed_passthrough_primitive_amd: {
|
||||||
|
@@ -766,7 +766,7 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|||||||
case nir_intrinsic_load_buffer_amd:
|
case nir_intrinsic_load_buffer_amd:
|
||||||
case nir_intrinsic_load_tess_rel_patch_id_amd:
|
case nir_intrinsic_load_tess_rel_patch_id_amd:
|
||||||
case nir_intrinsic_load_gs_vertex_offset_amd:
|
case nir_intrinsic_load_gs_vertex_offset_amd:
|
||||||
case nir_intrinsic_load_initial_edgeflag_amd:
|
case nir_intrinsic_load_initial_edgeflags_amd:
|
||||||
case nir_intrinsic_load_packed_passthrough_primitive_amd:
|
case nir_intrinsic_load_packed_passthrough_primitive_amd:
|
||||||
case nir_intrinsic_gds_atomic_add_amd:
|
case nir_intrinsic_gds_atomic_add_amd:
|
||||||
case nir_intrinsic_load_sbt_amd:
|
case nir_intrinsic_load_sbt_amd:
|
||||||
|
@@ -513,7 +513,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
|
|||||||
case nir_intrinsic_has_input_vertex_amd:
|
case nir_intrinsic_has_input_vertex_amd:
|
||||||
case nir_intrinsic_has_input_primitive_amd:
|
case nir_intrinsic_has_input_primitive_amd:
|
||||||
case nir_intrinsic_load_packed_passthrough_primitive_amd:
|
case nir_intrinsic_load_packed_passthrough_primitive_amd:
|
||||||
case nir_intrinsic_load_initial_edgeflag_amd:
|
case nir_intrinsic_load_initial_edgeflags_amd:
|
||||||
case nir_intrinsic_gds_atomic_add_amd:
|
case nir_intrinsic_gds_atomic_add_amd:
|
||||||
is_divergent = true;
|
is_divergent = true;
|
||||||
break;
|
break;
|
||||||
|
@@ -1196,8 +1196,8 @@ intrinsic("load_cull_small_primitives_enabled_amd", dest_comp=1, bit_sizes=[1],
|
|||||||
intrinsic("load_cull_any_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
|
intrinsic("load_cull_any_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
|
||||||
# Small primitive culling precision
|
# Small primitive culling precision
|
||||||
intrinsic("load_cull_small_prim_precision_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
|
intrinsic("load_cull_small_prim_precision_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||||
# Initial edge flag in a Vertex Shader. src = {vertex index}.
|
# Initial edge flags in a Vertex Shader, packed into the format the HW needs for primitive export.
|
||||||
intrinsic("load_initial_edgeflag_amd", src_comp=[1], dest_comp=1, indices=[])
|
intrinsic("load_initial_edgeflags_amd", src_comp=[], dest_comp=1, bit_sizes=[32], indices=[])
|
||||||
# Exports the current invocation's vertex. This is a placeholder where all vertex attribute export instructions should be emitted.
|
# Exports the current invocation's vertex. This is a placeholder where all vertex attribute export instructions should be emitted.
|
||||||
intrinsic("export_vertex_amd", src_comp=[], indices=[])
|
intrinsic("export_vertex_amd", src_comp=[], indices=[])
|
||||||
# Exports the current invocation's primitive. src[] = {packed_primitive_data}.
|
# Exports the current invocation's primitive. src[] = {packed_primitive_data}.
|
||||||
|
Reference in New Issue
Block a user