radv,aco: remove old streamout code

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18898>
2022-09-30 19:49:56 +01:00
parent 3a96977542
commit 0cb48ec3b7
6 changed files with 3 additions and 277 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -11553,119 +11553,6 @@ create_fs_exports(isel_context* ctx)
   ctx->block->kind |= block_kind_export_end;
 }

-static void
-emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
-                   const struct aco_stream_output* output)
-{
-   assert(ctx->stage.hw == HWStage::VS);
-
-   unsigned loc = output->location;
-   unsigned buf = output->buffer;
-
-   unsigned writemask = output->component_mask & ctx->outputs.mask[loc];
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && ctx->options->gfx_level == GFX6) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-
-      unsigned offset = output->offset + (start - (ffs(output->component_mask) - 1)) * 4;
-
-      Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
-         aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-      for (int i = 0; i < count; ++i)
-         vec->operands[i] = Operand(ctx->outputs.temps[loc * 4 + start + i]);
-      vec->definitions[0] = Definition(write_data);
-      ctx->block->instructions.emplace_back(std::move(vec));
-
-      aco_opcode opcode = get_buffer_store_op(count * 4);
-      aco_ptr<MUBUF_instruction> store{
-         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
-      store->operands[0] = Operand(so_buffers[buf]);
-      store->operands[1] = Operand(so_write_offset[buf]);
-      store->operands[2] = Operand::c32(0);
-      store->operands[3] = Operand(write_data);
-      if (offset > 4095) {
-         /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
-         Builder bld(ctx->program, ctx->block);
-         store->operands[1] =
-            bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
-      } else {
-         store->offset = offset;
-      }
-      store->offen = true;
-      store->glc = ctx->program->gfx_level < GFX11;
-      store->dlc = false;
-      store->slc = true;
-      ctx->block->instructions.emplace_back(std::move(store));
-   }
-}
-
-static void
-emit_streamout(isel_context* ctx, unsigned stream)
-{
-   Builder bld(ctx->program, ctx->block);
-
-   Temp so_vtx_count =
-      bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
-               get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
-
-   Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
-
-   Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
-
-   if_context ic;
-   begin_divergent_if_then(ctx, &ic, can_emit);
-
-   bld.reset(ctx->block);
-
-   Temp so_write_index =
-      bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
-
-   Temp so_buffers[4];
-   Temp so_write_offset[4];
-   Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
-
-   for (unsigned i = 0; i < 4; i++) {
-      unsigned stride = ctx->program->info.so.strides[i];
-      if (!stride)
-         continue;
-
-      so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
-                               bld.copy(bld.def(s1), Operand::c32(i * 16u)));
-
-      if (stride == 1) {
-         Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                get_arg(ctx, ctx->args->ac.streamout_write_index),
-                                get_arg(ctx, ctx->args->ac.streamout_offset[i]));
-         Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
-
-         so_write_offset[i] =
-            bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
-      } else {
-         Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
-         Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
-                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
-         so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
-      }
-   }
-
-   for (unsigned i = 0; i < ctx->program->info.so.num_outputs; i++) {
-      const struct aco_stream_output* output = &ctx->program->info.so.outputs[i];
-      if (stream != output->stream)
-         continue;
-
-      emit_stream_output(ctx, so_buffers, so_write_offset, output);
-   }
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-}
-
 Pseudo_instruction*
 add_startpgm(struct isel_context* ctx)
 {
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h
@@ -88,20 +88,6 @@ struct aco_vp_output_info {
   bool export_clip_dists;
 };

-struct aco_stream_output {
-   uint8_t location;
-   uint8_t buffer;
-   uint16_t offset;
-   uint8_t component_mask;
-   uint8_t stream;
-};
-
-struct aco_streamout_info {
-   uint16_t num_outputs;
-   struct aco_stream_output outputs[ACO_MAX_SO_OUTPUTS];
-   uint16_t strides[ACO_MAX_SO_BUFFERS];
-};
-
 struct aco_shader_info {
   uint8_t wave_size;
   bool is_ngg;
@@ -143,7 +129,6 @@ struct aco_shader_info {
   struct {
      uint8_t subgroup_size;
   } cs;
-   struct aco_streamout_info so;

   uint32_t gfx9_gs_ring_lds_size;
 };
--- a/src/amd/vulkan/radv_aco_shader_info.h
+++ b/src/amd/vulkan/radv_aco_shader_info.h
@@ -34,16 +34,6 @@
 #define ASSIGN_FIELD(x) aco_info->x = radv->x
 #define ASSIGN_FIELD_CP(x) memcpy(&aco_info->x, &radv->x, sizeof(radv->x))

-static inline void
-radv_aco_convert_shader_so_info(struct aco_shader_info *aco_info,
-                       const struct radv_shader_info *radv)
-{
-   ASSIGN_FIELD(so.num_outputs);
-   ASSIGN_FIELD_CP(so.outputs);
-   ASSIGN_FIELD_CP(so.strides);
-   /* enabled_stream_buffers_mask unused */
-}
-
 static inline void
 radv_aco_convert_shader_vp_info(struct aco_vp_output_info *aco_info,
 				const struct radv_vs_output_info *radv)
@@ -97,7 +87,6 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info,
   ASSIGN_FIELD(ps.num_interp);
   ASSIGN_FIELD(ps.spi_ps_input);
   ASSIGN_FIELD(cs.subgroup_size);
-   radv_aco_convert_shader_so_info(aco_info, radv);
   aco_info->gfx9_gs_ring_lds_size = radv->gs_ring_info.lds_size;
 }

--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -727,126 +727,6 @@ radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
   return LLVMBuildLoad2(ctx->ac.builder, type, output, "");
 }

-static void
-radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
-                        LLVMValueRef const *so_write_offsets,
-                        const struct radv_stream_output *output,
-                        struct radv_shader_output_values *shader_out)
-{
-   unsigned num_comps = util_bitcount(output->component_mask);
-   unsigned buf = output->buffer;
-   unsigned offset = output->offset;
-   unsigned start;
-   LLVMValueRef out[4];
-
-   assert(num_comps && num_comps <= 4);
-   if (!num_comps || num_comps > 4)
-      return;
-
-   /* Get the first component. */
-   start = ffs(output->component_mask) - 1;
-
-   /* Load the output as int. */
-   for (int i = 0; i < num_comps; i++) {
-      out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
-   }
-
-   /* Pack the output. */
-   LLVMValueRef vdata = NULL;
-
-   switch (num_comps) {
-   case 1: /* as i32 */
-      vdata = out[0];
-      break;
-   case 2: /* as v2i32 */
-   case 3: /* as v3i32 */
-   case 4: /* as v4i32 */
-      vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
-      break;
-   }
-
-   LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, so_write_offsets[buf],
-                                       LLVMConstInt(ctx->ac.i32, offset, 0), "");
-   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, NULL, voffset, ctx->ac.i32_0,
-                               ac_glc | ac_slc);
-}
-
-static void
-radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
-{
-   int i;
-
-   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
-   assert(ctx->args->ac.streamout_config.used);
-   LLVMValueRef so_vtx_count = ac_build_bfe(
-      &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
-      LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
-
-   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
-   /* can_emit = tid < so_vtx_count; */
-   LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
-
-   /* Emit the streamout code conditionally. This actually avoids
-    * out-of-bounds buffer access. The hw tells us via the SGPR
-    * (so_vtx_count) which threads are allowed to emit streamout data.
-    */
-   ac_build_ifcc(&ctx->ac, can_emit, 6501);
-   {
-      /* The buffer offset is computed as follows:
-       *   ByteOffset = streamout_offset[buffer_id]*4 +
-       *                (streamout_write_index + thread_id)*stride[buffer_id] +
-       *                attrib_offset
-       */
-      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
-
-      /* Compute (streamout_write_index + thread_id). */
-      so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
-
-      /* Load the descriptor and compute the write offset for each
-       * enabled buffer.
-       */
-      LLVMValueRef so_write_offset[4] = {0};
-      LLVMValueRef so_buffers[4] = {0};
-      struct ac_llvm_pointer buf_ptr = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->streamout_buffers);
-
-      for (i = 0; i < 4; i++) {
-         uint16_t stride = ctx->shader_info->so.strides[i];
-
-         if (!stride)
-            continue;
-
-         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
-
-         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
-         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
-
-         so_offset =
-            LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
-
-         so_write_offset[i] = ac_build_imad(
-            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
-      }
-
-      /* Write streamout data. */
-      for (i = 0; i < ctx->shader_info->so.num_outputs; i++) {
-         struct radv_shader_output_values shader_out = {0};
-         const struct radv_stream_output *output = &ctx->shader_info->so.outputs[i];
-
-         if (stream != output->stream)
-            continue;
-
-         for (int j = 0; j < 4; j++) {
-            shader_out.values[j] = radv_load_output(ctx, output->location, j);
-         }
-
-         radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
-      }
-   }
-   ac_build_endif(&ctx->ac, 6501);
-}
-
 static void
 radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
                         unsigned noutput, const struct radv_vs_output_info *outinfo,
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -171,17 +171,8 @@ enum radv_ud_index {
   AC_UD_MAX_UD = AC_UD_CS_MAX_UD,
 };

-struct radv_stream_output {
-   uint8_t location;
-   uint8_t buffer;
-   uint16_t offset;
-   uint8_t component_mask;
-   uint8_t stream;
-};
-
 struct radv_streamout_info {
   uint16_t num_outputs;
-   struct radv_stream_output outputs[MAX_SO_OUTPUTS];
   uint16_t strides[MAX_SO_BUFFERS];
   uint32_t enabled_stream_buffers_mask;
 };
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -287,15 +287,9 @@ gather_xfb_info(const nir_shader *nir, struct radv_shader_info *info)
   so->num_outputs = xfb->output_count;

   for (unsigned i = 0; i < xfb->output_count; i++) {
-      struct radv_stream_output *output = &so->outputs[i];
-
-      output->buffer = xfb->outputs[i].buffer;
-      output->stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
-      output->offset = xfb->outputs[i].offset;
-      output->location = xfb->outputs[i].location;
-      output->component_mask = xfb->outputs[i].component_mask;
-
-      so->enabled_stream_buffers_mask |= (1 << output->buffer) << (output->stream * 4);
+      unsigned output_buffer = xfb->outputs[i].buffer;
+      unsigned stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
+      so->enabled_stream_buffers_mask |= (1 << output_buffer) << (stream * 4);
   }

   for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {