From 88c951bd463af52abd0257419f1601d7767a963d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 9 Jan 2025 18:27:33 -0600 Subject: [PATCH] ac/nir: Move ac_nir_lower_legacy_gs to separate file. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák Part-of: --- src/amd/common/ac_nir.c | 275 ----------------------- src/amd/common/ac_nir_lower_legacy_gs.c | 285 ++++++++++++++++++++++++ src/amd/common/meson.build | 1 + 3 files changed, 286 insertions(+), 275 deletions(-) create mode 100644 src/amd/common/ac_nir_lower_legacy_gs.c diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 334ec372a10..5da74995e14 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -1027,281 +1027,6 @@ ac_nir_gs_shader_query(nir_builder *b, return true; } -typedef struct { - nir_def *outputs[64][4]; - nir_def *outputs_16bit_lo[16][4]; - nir_def *outputs_16bit_hi[16][4]; - - ac_nir_gs_output_info *info; - - nir_def *vertex_count[4]; - nir_def *primitive_count[4]; -} lower_legacy_gs_state; - -static bool -lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, - lower_legacy_gs_state *s) -{ - /* Assume: - * - the shader used nir_lower_io_to_temporaries - * - 64-bit outputs are lowered - * - no indirect indexing is present - */ - assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1])); - - b->cursor = nir_before_instr(&intrin->instr); - - unsigned component = nir_intrinsic_component(intrin); - unsigned write_mask = nir_intrinsic_write_mask(intrin); - nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); - - nir_def **outputs; - if (sem.location < VARYING_SLOT_VAR0_16BIT) { - outputs = s->outputs[sem.location]; - } else { - unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT; - if (sem.high_16bits) - outputs = s->outputs_16bit_hi[index]; - else - outputs = s->outputs_16bit_lo[index]; - } - - nir_def *store_val = intrin->src[0].ssa; - 
/* 64bit output has been lowered to 32bit */ - assert(store_val->bit_size <= 32); - - /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */ - const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16; - - u_foreach_bit (i, write_mask) { - unsigned comp = component + i; - nir_def *store_component = nir_channel(b, store_val, i); - - if (non_dedicated_16bit) { - if (sem.high_16bits) { - nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16); - outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component); - } else { - nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16); - outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi); - } - } else { - outputs[comp] = store_component; - } - } - - nir_instr_remove(&intrin->instr); - return true; -} - -static bool -lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin, - lower_legacy_gs_state *s) -{ - b->cursor = nir_before_instr(&intrin->instr); - - unsigned stream = nir_intrinsic_stream_id(intrin); - nir_def *vtxidx = intrin->src[0].ssa; - - nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream); - nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b); - - unsigned offset = 0; - u_foreach_bit64 (i, b->shader->info.outputs_written) { - for (unsigned j = 0; j < 4; j++) { - nir_def *output = s->outputs[i][j]; - /* Next vertex emit need a new value, reset all outputs. 
*/ - s->outputs[i][j] = NULL; - - const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i]; - - if (!(usage_mask & (1 << j)) || - ((s->info->streams[i] >> (j * 2)) & 0x3) != stream) - continue; - - unsigned base = offset * b->shader->info.gs.vertices_out * 4; - offset++; - - /* no one set this output, skip the buffer store */ - if (!output) - continue; - - nir_def *voffset = nir_ishl_imm(b, vtxidx, 2); - - /* extend 8/16 bit to 32 bit, 64 bit has been lowered */ - nir_def *data = nir_u2uN(b, output, 32); - - nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0), - .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL | - ACCESS_IS_SWIZZLED_AMD, - .base = base, - /* For ACO to not reorder this store around EmitVertex/EndPrimitve */ - .memory_modes = nir_var_shader_out); - } - } - - u_foreach_bit (i, b->shader->info.outputs_written_16bit) { - for (unsigned j = 0; j < 4; j++) { - nir_def *output_lo = s->outputs_16bit_lo[i][j]; - nir_def *output_hi = s->outputs_16bit_hi[i][j]; - /* Next vertex emit need a new value, reset all outputs. 
*/ - s->outputs_16bit_lo[i][j] = NULL; - s->outputs_16bit_hi[i][j] = NULL; - - bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) && - ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream; - bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) && - ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream; - if (!has_lo_16bit && !has_hi_16bit) - continue; - - unsigned base = offset * b->shader->info.gs.vertices_out; - offset++; - - bool has_lo_16bit_out = has_lo_16bit && output_lo; - bool has_hi_16bit_out = has_hi_16bit && output_hi; - - /* no one set needed output, skip the buffer store */ - if (!has_lo_16bit_out && !has_hi_16bit_out) - continue; - - if (!has_lo_16bit_out) - output_lo = nir_undef(b, 1, 16); - - if (!has_hi_16bit_out) - output_hi = nir_undef(b, 1, 16); - - nir_def *voffset = nir_iadd_imm(b, vtxidx, base); - voffset = nir_ishl_imm(b, voffset, 2); - - nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi), - gsvs_ring, voffset, soffset, nir_imm_int(b, 0), - .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL | - ACCESS_IS_SWIZZLED_AMD, - /* For ACO to not reorder this store around EmitVertex/EndPrimitve */ - .memory_modes = nir_var_shader_out); - } - } - - /* Signal vertex emission. 
*/ - nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), - .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8)); - - nir_instr_remove(&intrin->instr); - return true; -} - -static bool -lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin, - lower_legacy_gs_state *s) -{ - b->cursor = nir_before_instr(&intrin->instr); - - unsigned stream = nir_intrinsic_stream_id(intrin); - - s->vertex_count[stream] = intrin->src[0].ssa; - s->primitive_count[stream] = intrin->src[1].ssa; - - nir_instr_remove(&intrin->instr); - return true; -} - -static bool -lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin, - lower_legacy_gs_state *s) -{ - b->cursor = nir_before_instr(&intrin->instr); - const unsigned stream = nir_intrinsic_stream_id(intrin); - - /* Signal primitive emission. */ - nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), - .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8)); - - nir_instr_remove(&intrin->instr); - return true; -} - -static bool -lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state) -{ - lower_legacy_gs_state *s = (lower_legacy_gs_state *) state; - - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic == nir_intrinsic_store_output) - return lower_legacy_gs_store_output(b, intrin, s); - else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter) - return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s); - else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter) - return lower_legacy_gs_end_primitive_with_counter(b, intrin, s); - else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count) - return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s); - - return false; -} - -void -ac_nir_lower_legacy_gs(nir_shader *nir, - bool has_gen_prim_query, - bool has_pipeline_stats_query, - ac_nir_gs_output_info 
*output_info) -{ - lower_legacy_gs_state s = { - .info = output_info, - }; - - unsigned num_vertices_per_primitive = 0; - switch (nir->info.gs.output_primitive) { - case MESA_PRIM_POINTS: - num_vertices_per_primitive = 1; - break; - case MESA_PRIM_LINE_STRIP: - num_vertices_per_primitive = 2; - break; - case MESA_PRIM_TRIANGLE_STRIP: - num_vertices_per_primitive = 3; - break; - default: - unreachable("Invalid GS output primitive."); - break; - } - - nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic, - nir_metadata_control_flow, &s); - - nir_function_impl *impl = nir_shader_get_entrypoint(nir); - - nir_builder builder = nir_builder_at(nir_after_impl(impl)); - nir_builder *b = &builder; - - /* Emit shader query for mix use legacy/NGG GS */ - bool progress = ac_nir_gs_shader_query(b, - has_gen_prim_query, - has_pipeline_stats_query, - has_pipeline_stats_query, - num_vertices_per_primitive, - 64, - s.vertex_count, - s.primitive_count); - - /* Wait for all stores to finish. */ - nir_barrier(b, .execution_scope = SCOPE_INVOCATION, - .memory_scope = SCOPE_DEVICE, - .memory_semantics = NIR_MEMORY_RELEASE, - .memory_modes = nir_var_shader_out | nir_var_mem_ssbo | - nir_var_mem_global | nir_var_image); - - /* Signal that the GS is done. */ - nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), - .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE); - - if (progress) - nir_metadata_preserve(impl, nir_metadata_none); -} - /* Shader logging function for printing nir_def values. The driver prints this after * command submission. 
* diff --git a/src/amd/common/ac_nir_lower_legacy_gs.c b/src/amd/common/ac_nir_lower_legacy_gs.c new file mode 100644 index 00000000000..1a1b06c46ab --- /dev/null +++ b/src/amd/common/ac_nir_lower_legacy_gs.c @@ -0,0 +1,285 @@ +/* + * Copyright © 2021 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "ac_nir.h" +#include "ac_nir_helpers.h" + +#include "nir_builder.h" + +typedef struct { + nir_def *outputs[64][4]; + nir_def *outputs_16bit_lo[16][4]; + nir_def *outputs_16bit_hi[16][4]; + + ac_nir_gs_output_info *info; + + nir_def *vertex_count[4]; + nir_def *primitive_count[4]; +} lower_legacy_gs_state; + +static bool +lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, + lower_legacy_gs_state *s) +{ + /* Assume: + * - the shader used nir_lower_io_to_temporaries + * - 64-bit outputs are lowered + * - no indirect indexing is present + */ + assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1])); + + b->cursor = nir_before_instr(&intrin->instr); + + unsigned component = nir_intrinsic_component(intrin); + unsigned write_mask = nir_intrinsic_write_mask(intrin); + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + + nir_def **outputs; + if (sem.location < VARYING_SLOT_VAR0_16BIT) { + outputs = s->outputs[sem.location]; + } else { + unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT; + if (sem.high_16bits) + outputs = s->outputs_16bit_hi[index]; + else + outputs = s->outputs_16bit_lo[index]; + } + + nir_def *store_val = intrin->src[0].ssa; + /* 64-bit outputs have already been lowered to 32-bit. */ + assert(store_val->bit_size <= 32); + + /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. 
*/ + const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16; + + u_foreach_bit (i, write_mask) { + unsigned comp = component + i; + nir_def *store_component = nir_channel(b, store_val, i); + + if (non_dedicated_16bit) { + if (sem.high_16bits) { + nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16); + outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component); + } else { + nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16); + outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi); + } + } else { + outputs[comp] = store_component; + } + } + + nir_instr_remove(&intrin->instr); + return true; +} + +static bool +lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin, + lower_legacy_gs_state *s) +{ + b->cursor = nir_before_instr(&intrin->instr); + + unsigned stream = nir_intrinsic_stream_id(intrin); + nir_def *vtxidx = intrin->src[0].ssa; + + nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream); + nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b); + + unsigned offset = 0; + u_foreach_bit64 (i, b->shader->info.outputs_written) { + for (unsigned j = 0; j < 4; j++) { + nir_def *output = s->outputs[i][j]; + /* The next vertex emit needs new values, so reset all outputs. 
*/ + s->outputs[i][j] = NULL; + + const uint8_t usage_mask = s->info->varying_mask[i] | s->info->sysval_mask[i]; + + if (!(usage_mask & (1 << j)) || + ((s->info->streams[i] >> (j * 2)) & 0x3) != stream) + continue; + + unsigned base = offset * b->shader->info.gs.vertices_out * 4; + offset++; + + /* Nothing wrote this output, so skip the buffer store. */ + if (!output) + continue; + + nir_def *voffset = nir_ishl_imm(b, vtxidx, 2); + + /* Extend 8/16-bit values to 32 bits; 64-bit values have already been lowered. */ + nir_def *data = nir_u2uN(b, output, 32); + + nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0), + .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL | + ACCESS_IS_SWIZZLED_AMD, + .base = base, + /* Prevents ACO from reordering this store around EmitVertex/EndPrimitive. */ + .memory_modes = nir_var_shader_out); + } + } + + u_foreach_bit (i, b->shader->info.outputs_written_16bit) { + for (unsigned j = 0; j < 4; j++) { + nir_def *output_lo = s->outputs_16bit_lo[i][j]; + nir_def *output_hi = s->outputs_16bit_hi[i][j]; + /* The next vertex emit needs new values, so reset all outputs. 
*/ + s->outputs_16bit_lo[i][j] = NULL; + s->outputs_16bit_hi[i][j] = NULL; + + bool has_lo_16bit = (s->info->varying_mask_16bit_lo[i] & (1 << j)) && + ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream; + bool has_hi_16bit = (s->info->varying_mask_16bit_hi[i] & (1 << j)) && + ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream; + if (!has_lo_16bit && !has_hi_16bit) + continue; + + unsigned base = offset * b->shader->info.gs.vertices_out; + offset++; + + bool has_lo_16bit_out = has_lo_16bit && output_lo; + bool has_hi_16bit_out = has_hi_16bit && output_hi; + + /* Neither needed half was written, so skip the buffer store. */ + if (!has_lo_16bit_out && !has_hi_16bit_out) + continue; + + if (!has_lo_16bit_out) + output_lo = nir_undef(b, 1, 16); + + if (!has_hi_16bit_out) + output_hi = nir_undef(b, 1, 16); + + nir_def *voffset = nir_iadd_imm(b, vtxidx, base); + voffset = nir_ishl_imm(b, voffset, 2); + + nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi), + gsvs_ring, voffset, soffset, nir_imm_int(b, 0), + .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL | + ACCESS_IS_SWIZZLED_AMD, + /* Prevents ACO from reordering this store around EmitVertex/EndPrimitive. */ + .memory_modes = nir_var_shader_out); + } + } + + /* Signal vertex emission. 
*/ + nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), + .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8)); + + nir_instr_remove(&intrin->instr); + return true; +} + +static bool +lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin, + lower_legacy_gs_state *s) +{ + b->cursor = nir_before_instr(&intrin->instr); + + unsigned stream = nir_intrinsic_stream_id(intrin); + + s->vertex_count[stream] = intrin->src[0].ssa; + s->primitive_count[stream] = intrin->src[1].ssa; + + nir_instr_remove(&intrin->instr); + return true; +} + +static bool +lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin, + lower_legacy_gs_state *s) +{ + b->cursor = nir_before_instr(&intrin->instr); + const unsigned stream = nir_intrinsic_stream_id(intrin); + + /* Signal the end of the current primitive (cut). */ + nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), + .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8)); + + nir_instr_remove(&intrin->instr); + return true; +} + +static bool +lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state) +{ + lower_legacy_gs_state *s = (lower_legacy_gs_state *) state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_store_output) + return lower_legacy_gs_store_output(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter) + return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter) + return lower_legacy_gs_end_primitive_with_counter(b, intrin, s); + else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count) + return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s); + + return false; +} + +void +ac_nir_lower_legacy_gs(nir_shader *nir, + bool has_gen_prim_query, + bool has_pipeline_stats_query, + ac_nir_gs_output_info 
*output_info) +{ + lower_legacy_gs_state s = { + .info = output_info, + }; + + unsigned num_vertices_per_primitive = 0; + switch (nir->info.gs.output_primitive) { + case MESA_PRIM_POINTS: + num_vertices_per_primitive = 1; + break; + case MESA_PRIM_LINE_STRIP: + num_vertices_per_primitive = 2; + break; + case MESA_PRIM_TRIANGLE_STRIP: + num_vertices_per_primitive = 3; + break; + default: + unreachable("Invalid GS output primitive."); + break; + } + + nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic, + nir_metadata_control_flow, &s); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder builder = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &builder; + + /* Emit the shader query for mixed use of legacy/NGG GS. */ + bool progress = ac_nir_gs_shader_query(b, + has_gen_prim_query, + has_pipeline_stats_query, + has_pipeline_stats_query, + num_vertices_per_primitive, + 64, + s.vertex_count, + s.primitive_count); + + /* Wait for all stores to finish. */ + nir_barrier(b, .execution_scope = SCOPE_INVOCATION, + .memory_scope = SCOPE_DEVICE, + .memory_semantics = NIR_MEMORY_RELEASE, + .memory_modes = nir_var_shader_out | nir_var_mem_ssbo | + nir_var_mem_global | nir_var_image); + + /* Signal that the GS is done. */ + nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b), + .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE); + + if (progress) + nir_metadata_preserve(impl, nir_metadata_none); +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 53c020ec883..0a9a57468ce 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -92,6 +92,7 @@ amd_common_files = files( 'ac_nir_lower_global_access.c', 'ac_nir_lower_image_opcodes_cdna.c', 'ac_nir_lower_intrinsics_to_args.c', + 'ac_nir_lower_legacy_gs.c', 'ac_nir_lower_legacy_vs.c', 'ac_nir_lower_resinfo.c', 'ac_nir_lower_taskmesh_io_to_mem.c',