diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index 8e738d5fb02..e447b095e0f 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -438,3 +438,231 @@ ac_nir_gs_shader_query(nir_builder *b,
    nir_pop_if(b, if_shader_query);
    return true;
 }
+
+typedef struct {
+   nir_ssa_def *outputs[64][4];
+   nir_ssa_def *outputs_16bit_lo[16][4];
+   nir_ssa_def *outputs_16bit_hi[16][4];
+
+   ac_nir_gs_output_info *info;
+
+   nir_ssa_def *vertex_count[4];
+   nir_ssa_def *primitive_count[4];
+} lower_legacy_gs_state;
+
+static bool
+lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
+                             lower_legacy_gs_state *s)
+{
+   /* Assume:
+    * - the shader used nir_lower_io_to_temporaries
+    * - 64-bit outputs are lowered
+    * - no indirect indexing is present
+    */
+   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned component = nir_intrinsic_component(intrin);
+   unsigned write_mask = nir_intrinsic_write_mask(intrin);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+
+   nir_ssa_def **outputs;
+   if (sem.location < VARYING_SLOT_VAR0_16BIT) {
+      outputs = s->outputs[sem.location];
+   } else {
+      unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
+      if (sem.high_16bits)
+         outputs = s->outputs_16bit_hi[index];
+      else
+         outputs = s->outputs_16bit_lo[index];
+   }
+
+   nir_ssa_def *store_val = intrin->src[0].ssa;
+   /* 64-bit outputs have already been lowered to 32-bit. */
+   assert(store_val->bit_size <= 32);
+
+   u_foreach_bit (i, write_mask) {
+      unsigned comp = component + i;
+      outputs[comp] = nir_channel(b, store_val, i);
+   }
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
+                                         lower_legacy_gs_state *s)
+{
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned stream = nir_intrinsic_stream_id(intrin);
+   nir_ssa_def *vtxidx = intrin->src[0].ssa;
+
+   nir_ssa_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
+   nir_ssa_def *soffset = nir_load_ring_gs2vs_offset_amd(b);
+
+   unsigned offset = 0;
+   u_foreach_bit64 (i, b->shader->info.outputs_written) {
+      for (unsigned j = 0; j < 4; j++) {
+         nir_ssa_def *output = s->outputs[i][j];
+         /* The next vertex emit needs new values, so reset all outputs. */
+         s->outputs[i][j] = NULL;
+
+         if (!(s->info->usage_mask[i] & (1 << j)) ||
+             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
+            continue;
+
+         unsigned base = offset * b->shader->info.gs.vertices_out;
+         offset++;
+
+         /* No one wrote this output, skip the buffer store. */
+         if (!output)
+            continue;
+
+         nir_ssa_def *voffset = nir_iadd_imm(b, vtxidx, base);
+         voffset = nir_ishl_imm(b, voffset, 2);
+
+         /* Extend 8/16-bit to 32-bit; 64-bit has already been lowered. */
+         nir_ssa_def *data = nir_u2uN(b, output, 32);
+
+         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
+                              .is_swizzled = true, .slc_amd = true,
+                              .access = ACCESS_COHERENT,
+                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive */
+                              .memory_modes = nir_var_shader_out);
+      }
+   }
+
+   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
+      for (unsigned j = 0; j < 4; j++) {
+         nir_ssa_def *output_lo = s->outputs_16bit_lo[i][j];
+         nir_ssa_def *output_hi = s->outputs_16bit_hi[i][j];
+         /* The next vertex emit needs new values, so reset all outputs. */
+         s->outputs_16bit_lo[i][j] = NULL;
+         s->outputs_16bit_hi[i][j] = NULL;
+
+         bool has_lo_16bit = (s->info->usage_mask_16bit_lo[i] & (1 << j)) &&
+            ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
+         bool has_hi_16bit = (s->info->usage_mask_16bit_hi[i] & (1 << j)) &&
+            ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
+         if (!has_lo_16bit && !has_hi_16bit)
+            continue;
+
+         unsigned base = offset * b->shader->info.gs.vertices_out;
+         offset++;
+
+         bool has_lo_16bit_out = has_lo_16bit && output_lo;
+         bool has_hi_16bit_out = has_hi_16bit && output_hi;
+
+         /* Neither needed output was written, skip the buffer store. */
+         if (!has_lo_16bit_out && !has_hi_16bit_out)
+            continue;
+
+         if (!has_lo_16bit_out)
+            output_lo = nir_ssa_undef(b, 1, 16);
+
+         if (!has_hi_16bit_out)
+            output_hi = nir_ssa_undef(b, 1, 16);
+
+         nir_ssa_def *voffset = nir_iadd_imm(b, vtxidx, base);
+         voffset = nir_ishl_imm(b, voffset, 2);
+
+         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
+                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
+                              .is_swizzled = true, .slc_amd = true,
+                              .access = ACCESS_COHERENT,
+                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive */
+                              .memory_modes = nir_var_shader_out);
+      }
+   }
+
+   /* Keep this instruction to signal vertex emission. */
+
+   return true;
+}
+
+static bool
+lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
+                                               lower_legacy_gs_state *s)
+{
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   unsigned stream = nir_intrinsic_stream_id(intrin);
+
+   s->vertex_count[stream] = intrin->src[0].ssa;
+   s->primitive_count[stream] = intrin->src[1].ssa;
+
+   nir_instr_remove(&intrin->instr);
+   return true;
+}
+
+static bool
+lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state)
+{
+   lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   if (intrin->intrinsic == nir_intrinsic_store_output)
+      return lower_legacy_gs_store_output(b, intrin, s);
+   else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
+      return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
+   else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
+      return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);
+
+   return false;
+}
+
+void
+ac_nir_lower_legacy_gs(nir_shader *nir,
+                       bool has_gen_prim_query,
+                       bool has_pipeline_stats_query,
+                       ac_nir_gs_output_info *output_info)
+{
+   lower_legacy_gs_state s = {
+      .info = output_info,
+   };
+
+   unsigned num_vertices_per_primitive = 0;
+   switch (nir->info.gs.output_primitive) {
+   case SHADER_PRIM_POINTS:
+      num_vertices_per_primitive = 1;
+      break;
+   case SHADER_PRIM_LINE_STRIP:
+      num_vertices_per_primitive = 2;
+      break;
+   case SHADER_PRIM_TRIANGLE_STRIP:
+      num_vertices_per_primitive = 3;
+      break;
+   default:
+      unreachable("Invalid GS output primitive.");
+      break;
+   }
+
+   nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic,
+                                nir_metadata_block_index | nir_metadata_dominance, &s);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_builder builder;
+   nir_builder *b = &builder;
+   nir_builder_init(b, impl);
+
+   b->cursor = nir_after_cf_list(&impl->body);
+
+   /* Emit the shader query code shared with NGG GS (for mixed legacy/NGG GS use). */
+   bool progress = ac_nir_gs_shader_query(b,
+                                          has_gen_prim_query,
+                                          has_pipeline_stats_query,
+                                          num_vertices_per_primitive,
+                                          64,
+                                          s.vertex_count,
+                                          s.primitive_count);
+   if (progress)
+      nir_metadata_preserve(impl, nir_metadata_none);
+}
diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index e32b6940387..3f5752aeaf2 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -187,6 +187,16 @@ ac_nir_lower_global_access(nir_shader *shader);
 bool
 ac_nir_lower_resinfo(nir_shader *nir, enum amd_gfx_level gfx_level);
 
+typedef struct ac_nir_gs_output_info {
+   const uint8_t *streams;
+   const uint8_t *streams_16bit_lo;
+   const uint8_t *streams_16bit_hi;
+
+   const uint8_t *usage_mask;
+   const uint8_t *usage_mask_16bit_lo;
+   const uint8_t *usage_mask_16bit_hi;
+} ac_nir_gs_output_info;
+
 nir_shader *
 ac_nir_create_gs_copy_shader(const nir_shader *gs_nir,
                              bool disable_streamout,
@@ -207,6 +217,12 @@ ac_nir_gs_shader_query(nir_builder *b,
                        nir_ssa_def *vertex_count[4],
                        nir_ssa_def *primitive_count[4]);
 
+void
+ac_nir_lower_legacy_gs(nir_shader *nir,
+                       bool has_gen_prim_query,
+                       bool has_pipeline_stats_query,
+                       ac_nir_gs_output_info *output_info);
+
 #ifdef __cplusplus
 }
 #endif
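
Note on the GSVS ring addressing used by the new pass (illustrative, not part of the
patch): lower_legacy_gs_emit_vertex_with_counter lays out the ring so that each
enabled 32-bit output component of a stream occupies a contiguous block of
gs.vertices_out dwords, and a vertex's value lands at its vertex index within that
block. The helper below only restates the voffset computation from the pass
((vtxidx + offset * vertices_out) << 2); the function name is made up for
illustration.

   /* Hypothetical helper mirroring the voffset math in the pass. */
   static unsigned
   gsvs_ring_byte_offset(unsigned component_slot, unsigned vtxidx, unsigned vertices_out)
   {
      /* component_slot is the running "offset" counter in the pass:
       * the index of this enabled component within the stream. */
      return (vtxidx + component_slot * vertices_out) * 4u;
   }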
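
Usage sketch (illustrative only, not part of the patch): a driver is expected to
gather the per-slot usage masks and stream assignments itself and hand them to the
pass through ac_nir_gs_output_info. The array parameters and the wrapper function
below are hypothetical placeholders, not existing driver code; shaders without
16-bit outputs are assumed for brevity.

   #include "ac_nir.h"

   static void
   run_legacy_gs_lowering(nir_shader *gs_nir,
                          const uint8_t usage_mask[64], /* per-slot component mask */
                          const uint8_t streams[64])    /* 2 bits of stream id per component */
   {
      ac_nir_gs_output_info info = {
         .streams = streams,
         .usage_mask = usage_mask,
         /* streams_16bit_lo/hi and usage_mask_16bit_lo/hi must also be filled
          * when outputs_written_16bit is non-zero; omitted in this sketch. */
      };

      /* Whether the primitive-generated and pipeline-statistics queries are
       * emulated in the shader depends on the driver; both assumed here. */
      ac_nir_lower_legacy_gs(gs_nir, true, true, &info);
   }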