radv,aco: use ac_nir_lower_legacy_gs

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20296>
2022-12-05 17:32:15 +00:00
parent c7cedaaee2
commit 18d3e4fecd
3 changed files with 15 additions and 142 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7760,102 +7760,6 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
   }
 }
 /* Selects instructions for nir_intrinsic_emit_vertex_with_counter.
  *
  * For the stream given by the intrinsic's stream id, stores every enabled
  * output component of the current vertex through a patched copy of the GSVS
  * ring descriptor, then issues a GS s_sendmsg for that stream.
  *
  * instr->src[0] is used as the index of the vertex being emitted; it may be a
  * compile-time constant or a runtime value.
  *
  * Side effect: ctx->outputs.mask[] is cleared for every slot from 0 up to and
  * including VARYING_SLOT_VAR31.
  */
 void
 visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
 {
   Builder bld(ctx->program, ctx->block);
   unsigned stream = nir_intrinsic_stream_id(instr);
   /* Vertex index scaled by 4. This VGPR is only consumed by the stores below
    * when src[0] is not a compile-time constant. */
   Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
   /* Treated as nullable below: non-null means the vertex index is constant. */
   nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
   /* get GSVS ring */
   Temp gsvs_ring =
      bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
               Operand::c32(RING_GSVS_GS * 16u));
   /* stride = 4 * (output components of this stream) * vertices_out */
   unsigned num_components = ctx->program->info.gs.num_stream_output_components[stream];
   unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
   /* Skip past the data of all lower-numbered streams: each one accounts for
    * its own stride multiplied by the wave size. */
   unsigned stream_offset = 0;
   for (unsigned i = 0; i < stream; i++) {
      unsigned prev_stride = 4u * ctx->program->info.gs.num_stream_output_components[i] *
                             ctx->shader->info.gs.vertices_out;
      stream_offset += prev_stride * ctx->program->wave_size;
   }
   /* Limit on the stride field for <= GFX7. */
   assert(stride < (1 << 14));
   /* Split the loaded descriptor into its four dwords so they can be patched
    * individually. */
   Temp gsvs_dwords[4];
   for (unsigned i = 0; i < 4; i++)
      gsvs_dwords[i] = bld.tmp(s1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
              Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
   if (stream_offset) {
      /* 64-bit add of stream_offset onto dwords 0/1: s_add_u32 produces the
       * carry in SCC, s_addc_u32 consumes it.
       * NOTE(review): presumably these dwords hold the ring base address - confirm. */
      Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
      Temp carry = bld.tmp(s1);
      gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
                                gsvs_dwords[0], stream_offset_tmp);
      gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
                                gsvs_dwords[1], Operand::zero(), bld.scc(carry));
   }
   /* OR the stride field into dword 1. */
   gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
                             Operand::c32(S_008F04_STRIDE(stride)));
   /* Overwrite dword 2 with the wave size.
    * NOTE(review): presumably this is the descriptor's record count - confirm. */
   gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
   gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
                          gsvs_dwords[2], gsvs_dwords[3]);
   /* Running offset (multiplied by 4 when used). It advances by vertices_out for
    * every component that belongs to this stream and is in the usage mask,
    * whether or not a store is emitted for it. */
   unsigned offset = 0;
   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
      for (unsigned j = 0; j < 4; j++) {
         /* output_streams[i] packs a 2-bit stream index per component. */
         if (((ctx->program->info.gs.output_streams[i] >> (j * 2)) & 0x3) != stream)
            continue;
         if (!(ctx->program->info.gs.output_usage_mask[i] & (1 << j)))
            continue;
         /* Only components whose bit is set in ctx->outputs.mask are stored. */
         if (ctx->outputs.mask[i] & (1 << j)) {
            /* Constant vertex index: fold it into the immediate offset and start
             * with an undefined VGPR offset. Otherwise use the scaled index. */
            Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
            unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
            /* Keep the immediate below 4096 by moving its 4096-aligned part into
             * the VGPR offset. */
            if (const_offset >= 4096u) {
               if (vaddr_offset.isUndefined())
                  vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
               else
                  vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
                                            vaddr_offset);
               const_offset %= 4096u;
            }
            /* Operands: patched descriptor, VGPR offset, gs2vs_offset argument,
             * value to store. */
            aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
               aco_opcode::buffer_store_dword, Format::MUBUF, 4, 0)};
            mubuf->operands[0] = Operand(gsvs_ring);
            mubuf->operands[1] = vaddr_offset;
            mubuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
            mubuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
            /* offen is set only when a VGPR offset is actually present. */
            mubuf->offen = !vaddr_offset.isUndefined();
            mubuf->offset = const_offset;
            /* glc is only set on gfx levels before GFX11. */
            mubuf->glc = ctx->program->gfx_level < GFX11;
            mubuf->slc = true;
            mubuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
            bld.insert(std::move(mubuf));
         }
         offset += ctx->shader->info.gs.vertices_out;
      }
      /* outputs for the next vertex are undefined and keeping them around can
       * create invalid IR with control flow */
      ctx->outputs.mask[i] = 0;
   }
   /* Send the GS message for this stream with gs_wave_id in m0.
    * NOTE(review): the (false, true) arguments are presumably cut=false,
    * emit=true - confirm against sendmsg_gs. */
   bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
 }
 Temp
 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
 {
@@ -9170,7 +9074,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
   }
   case nir_intrinsic_emit_vertex_with_counter: {
      assert(ctx->stage.hw == HWStage::GS);
-      visit_emit_vertex_with_counter(ctx, instr);
+      unsigned stream = nir_intrinsic_stream_id(instr);
      bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
      break;
   }
   case nir_intrinsic_end_primitive_with_counter: {
@@ -9181,11 +9086,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
      }
      break;
   }
   case nir_intrinsic_set_vertex_and_primitive_count: {
      assert(ctx->stage.hw == HWStage::GS);
      /* unused in the legacy pipeline, the HW keeps track of this for us */
      break;
   }
   case nir_intrinsic_is_subgroup_invocation_lt_amd: {
      Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), lanecount_to_mask(ctx, src));
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -194,44 +194,7 @@ static void
 visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
                               LLVMValueRef *addrs)
 {
   /* Stores the outputs of one vertex for `stream` to ctx->gsvs_ring[stream]
    * and then sends the GS emit message for that stream.
    *
    * abi:       used to recover the radv_shader_context and the is_16bit flags.
    * stream:    stream index; selects the ring and is encoded into the message.
    * vertexidx: index of the vertex being emitted, added to each store offset.
    * addrs:     four entries per output slot; each used entry is a pointer the
    *            component value is loaded from.
    */
   /* Number of components stored so far; it is multiplied by vertices_out
    * when the store offset is built. */
   unsigned offset = 0;
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i];
      uint8_t output_stream = ctx->shader_info->gs.output_streams[i];
      LLVMValueRef *out_ptr = &addrs[i * 4];
      bool *is_16bit_ptr = &abi->is_16bit[i * 4];
      /* NOTE(review): presumably one past the highest set bit of the usage
       * mask, so the inner loop stops after the last used component - confirm. */
      int length = util_last_bit(output_usage_mask);
      /* Slots that are not set in ctx->output_mask are skipped entirely. */
      if (!(ctx->output_mask & (1ull << i)))
         continue;
      for (unsigned j = 0; j < length; j++) {
         /* output_stream packs a 2-bit stream index per component. */
         if (((output_stream >> (j * 2)) & 0x3) != stream)
            continue;
         if (!(output_usage_mask & (1 << j)))
            continue;
         LLVMTypeRef type = is_16bit_ptr[j] ? ctx->ac.f16 : ctx->ac.f32;
         LLVMValueRef out_val = LLVMBuildLoad2(ctx->ac.builder, type, out_ptr[j], "");
         /* voffset = (offset * vertices_out + vertexidx) * 4 */
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);
         offset++;
         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
         /* Convert to an integer and widen to i32 before the dword store. */
         out_val = ac_to_integer(&ctx->ac, out_val);
         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, NULL, voffset,
                                     ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset),
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }
   /* GS emit message with the stream index placed at bit 8, sent with
    * ctx->gs_wave_id. */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
 }
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3475,10 +3475,20 @@ radv_postprocess_nir(struct radv_pipeline *pipeline,
   if (lowered_ngg)
      radv_lower_ngg(device, stage, pipeline_key);
-   if (stage->stage == last_vgt_api_stage && stage->stage != MESA_SHADER_GEOMETRY && !lowered_ngg)
+   if (stage->stage == last_vgt_api_stage && !lowered_ngg) {
      if (stage->stage != MESA_SHADER_GEOMETRY) {
         NIR_PASS_V(stage->nir, ac_nir_lower_legacy_vs,
                    stage->info.outinfo.export_prim_id ? VARYING_SLOT_PRIMITIVE_ID : -1, false);
      } else {
         ac_nir_gs_output_info gs_out_info = {
            .streams = stage->info.gs.output_streams,
            .usage_mask = stage->info.gs.output_usage_mask,
         };
         NIR_PASS_V(stage->nir, ac_nir_lower_legacy_gs, false, false, &gs_out_info);
      }
   }
   NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8);
   NIR_PASS(_, stage->nir, nir_lower_idiv,