radv,aco: remove old streamout code

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18898>
This commit is contained in:
Rhys Perry
2022-09-30 19:49:56 +01:00
committed by Marge Bot
parent 3a96977542
commit 0cb48ec3b7
6 changed files with 3 additions and 277 deletions

View File

@@ -11553,119 +11553,6 @@ create_fs_exports(isel_context* ctx)
ctx->block->kind |= block_kind_export_end; ctx->block->kind |= block_kind_export_end;
} }
/* Emit the buffer stores for a single transform feedback output.
 *
 * ctx             - instruction selection context; must be in the hardware VS stage.
 * so_buffers      - per-buffer values used as operand 0 of each store, indexed by
 *                   output->buffer.
 * so_write_offset - per-buffer base offsets used as the store's offset operand.
 * output          - description of the output (location, buffer, offset, mask).
 *
 * Only the components that are both requested by output->component_mask and
 * present in ctx->outputs.mask[location] are stored. Each run of consecutive
 * components becomes one p_create_vector followed by one MUBUF store.
 */
static void
emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
                   const struct aco_stream_output* output)
{
   assert(ctx->stage.hw == HWStage::VS);

   const unsigned loc = output->location;
   const unsigned buf = output->buffer;
   /* Index of the lowest component named by the mask; output->offset refers to it. */
   const int first_comp = ffs(output->component_mask) - 1;

   unsigned pending = output->component_mask & ctx->outputs.mask[loc];
   while (pending) {
      int first, num;
      u_bit_scan_consecutive_range(&pending, &first, &num);

      if (ctx->options->gfx_level == GFX6 && num == 3) {
         /* No vec3 stores on GFX6: store two components now and put the third
          * one back into the mask so the next iteration handles it.
          */
         pending |= 1u << (first + 2);
         num = 2;
      }

      /* Four bytes per component, relative to the first component of the mask. */
      const unsigned byte_offset = output->offset + (first - first_comp) * 4;

      /* Gather the components of this run into one VGPR vector. */
      Temp data = ctx->program->allocateTmp(RegClass(RegType::vgpr, num));
      aco_ptr<Pseudo_instruction> gather{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num, 1)};
      gather->definitions[0] = Definition(data);
      for (int c = 0; c < num; ++c)
         gather->operands[c] = Operand(ctx->outputs.temps[loc * 4 + first + c]);
      ctx->block->instructions.emplace_back(std::move(gather));

      aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(
         get_buffer_store_op(num * 4), Format::MUBUF, 4, 0)};
      store->operands[0] = Operand(so_buffers[buf]);
      store->operands[2] = Operand::c32(0);
      store->operands[3] = Operand(data);

      if (byte_offset <= 4095) {
         store->operands[1] = Operand(so_write_offset[buf]);
         store->offset = byte_offset;
      } else {
         /* Offsets above 4095 are not put in the instruction's offset field;
          * they are added to the per-buffer write offset instead. The original
          * author did not expect this path to be hit by RADV.
          */
         Builder bld(ctx->program, ctx->block);
         store->operands[1] =
            bld.vadd32(bld.def(v1), Operand::c32(byte_offset), Operand(so_write_offset[buf]));
      }

      store->offen = true;
      store->glc = ctx->program->gfx_level < GFX11;
      store->dlc = false;
      store->slc = true;
      ctx->block->instructions.emplace_back(std::move(store));
   }
}
/* Emit the transform feedback stores for every output that belongs to `stream`.
 *
 * The stores are placed inside a divergent if so that only lanes whose index is
 * below the vertex count read from streamout_config execute them. For each
 * buffer with a non-zero stride, a 4-dword value is loaded from the streamout
 * buffer pointer and a per-lane write offset is computed; both are then handed
 * to emit_stream_output() for each matching output.
 */
static void
emit_streamout(isel_context* ctx, unsigned stream)
{
Builder bld(ctx->program, ctx->block);
/* Extract the vertex count from streamout_config: the s_bfe_u32 control word
 * 0x70010 selects a 7-bit field starting at bit 16.
 */
Temp so_vtx_count =
bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
/* NOTE(review): emit_mbcnt presumably yields the lane index within the wave - confirm. */
Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
/* can_emit = so_vtx_count > tid */
Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
if_context ic;
begin_divergent_if_then(ctx, &ic, can_emit);
/* Re-point the builder at ctx->block after entering the then-branch. */
bld.reset(ctx->block);
/* streamout_write_index + tid */
Temp so_write_index =
bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
/* Entries for buffers with a zero stride are left default-constructed. */
Temp so_buffers[4];
Temp so_write_offset[4];
Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
for (unsigned i = 0; i < 4; i++) {
unsigned stride = ctx->program->info.so.strides[i];
/* A stride of zero means buffer i is skipped. */
if (!stride)
continue;
/* Load four dwords for buffer i from buf_ptr at byte offset i * 16. */
so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
bld.copy(bld.def(s1), Operand::c32(i * 16u)));
if (stride == 1) {
/* With a stride of 1 the offset is
 * (streamout_write_index + streamout_offset[i] + tid) << 2.
 */
Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.streamout_write_index),
get_arg(ctx, ctx->args->ac.streamout_offset[i]));
Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
so_write_offset[i] =
bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
} else {
/* General case:
 * (streamout_write_index + tid) * (stride * 4) + streamout_offset[i] * 4.
 */
Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
get_arg(ctx, ctx->args->ac.streamout_offset[i]));
so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
}
}
/* Emit the stores for the outputs that belong to the requested stream. */
for (unsigned i = 0; i < ctx->program->info.so.num_outputs; i++) {
const struct aco_stream_output* output = &ctx->program->info.so.outputs[i];
if (stream != output->stream)
continue;
emit_stream_output(ctx, so_buffers, so_write_offset, output);
}
/* Nothing is emitted between these two calls: the else-branch stays empty. */
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
}
Pseudo_instruction* Pseudo_instruction*
add_startpgm(struct isel_context* ctx) add_startpgm(struct isel_context* ctx)
{ {

View File

@@ -88,20 +88,6 @@ struct aco_vp_output_info {
bool export_clip_dists; bool export_clip_dists;
}; };
/* Description of one transform feedback output as read by ACO's
 * emit_stream_output()/emit_streamout(). The array of these is filled with a
 * memcpy from the RADV shader info, so the layout must stay compatible with
 * the struct it is copied from.
 */
struct aco_stream_output {
/* Output location; indexes ctx->outputs.mask[] and (times 4) ctx->outputs.temps[]. */
uint8_t location;
/* Index into the per-buffer arrays (so_buffers / so_write_offset). */
uint8_t buffer;
/* Offset added to the store offset for the first component of component_mask;
 * later components add 4 each.
 */
uint16_t offset;
/* Bitmask of the components to write; ANDed with the written-components mask. */
uint8_t component_mask;
/* Stream this output belongs to; outputs of other streams are skipped. */
uint8_t stream;
};
/* Transform feedback information made available to ACO through
 * aco_shader_info::so.
 */
struct aco_streamout_info {
/* Number of valid entries in outputs[]. */
uint16_t num_outputs;
/* Per-output descriptions; only the first num_outputs entries are iterated. */
struct aco_stream_output outputs[ACO_MAX_SO_OUTPUTS];
/* Per-buffer stride; zero means the buffer is skipped. The value is multiplied
 * by 4 when write offsets are computed.
 * NOTE(review): presumably expressed in dwords - confirm.
 */
uint16_t strides[ACO_MAX_SO_BUFFERS];
};
struct aco_shader_info { struct aco_shader_info {
uint8_t wave_size; uint8_t wave_size;
bool is_ngg; bool is_ngg;
@@ -143,7 +129,6 @@ struct aco_shader_info {
struct { struct {
uint8_t subgroup_size; uint8_t subgroup_size;
} cs; } cs;
struct aco_streamout_info so;
uint32_t gfx9_gs_ring_lds_size; uint32_t gfx9_gs_ring_lds_size;
}; };

View File

@@ -34,16 +34,6 @@
#define ASSIGN_FIELD(x) aco_info->x = radv->x #define ASSIGN_FIELD(x) aco_info->x = radv->x
#define ASSIGN_FIELD_CP(x) memcpy(&aco_info->x, &radv->x, sizeof(radv->x)) #define ASSIGN_FIELD_CP(x) memcpy(&aco_info->x, &radv->x, sizeof(radv->x))
static inline void
radv_aco_convert_shader_so_info(struct aco_shader_info *aco_info,
const struct radv_shader_info *radv)
{
ASSIGN_FIELD(so.num_outputs);
ASSIGN_FIELD_CP(so.outputs);
ASSIGN_FIELD_CP(so.strides);
/* enabled_stream_buffers_mask unused */
}
static inline void static inline void
radv_aco_convert_shader_vp_info(struct aco_vp_output_info *aco_info, radv_aco_convert_shader_vp_info(struct aco_vp_output_info *aco_info,
const struct radv_vs_output_info *radv) const struct radv_vs_output_info *radv)
@@ -97,7 +87,6 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info,
ASSIGN_FIELD(ps.num_interp); ASSIGN_FIELD(ps.num_interp);
ASSIGN_FIELD(ps.spi_ps_input); ASSIGN_FIELD(ps.spi_ps_input);
ASSIGN_FIELD(cs.subgroup_size); ASSIGN_FIELD(cs.subgroup_size);
radv_aco_convert_shader_so_info(aco_info, radv);
aco_info->gfx9_gs_ring_lds_size = radv->gs_ring_info.lds_size; aco_info->gfx9_gs_ring_lds_size = radv->gs_ring_info.lds_size;
} }

View File

@@ -727,126 +727,6 @@ radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
return LLVMBuildLoad2(ctx->ac.builder, type, output, ""); return LLVMBuildLoad2(ctx->ac.builder, type, output, "");
} }
/* Store one transform feedback output to its streamout buffer.
 *
 * ctx              - shader translation context.
 * so_buffers       - per-buffer values passed to the buffer store, indexed by
 *                    output->buffer.
 * so_write_offsets - per-buffer write offsets, indexed by output->buffer.
 * output           - description of the output (buffer, offset, component mask).
 * shader_out       - the four channel values of the output's location.
 *
 * The number of stored components is the popcount of output->component_mask,
 * and they are read consecutively starting at the lowest set bit of the mask.
 * Nothing is emitted when that count is outside 1..4.
 */
static void
radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
                        LLVMValueRef const *so_write_offsets,
                        const struct radv_stream_output *output,
                        struct radv_shader_output_values *shader_out)
{
   const unsigned count = util_bitcount(output->component_mask);

   assert(count && count <= 4);
   if (count == 0 || count > 4)
      return;

   /* Lowest component selected by the mask. */
   const unsigned first = ffs(output->component_mask) - 1;

   /* Reinterpret each component as an integer value. */
   LLVMValueRef comps[4];
   for (unsigned c = 0; c < count; c++)
      comps[c] = ac_to_integer(&ctx->ac, shader_out->values[first + c]);

   /* A single component is stored as-is (i32); two to four are packed into a
    * vector (v2i32 / v3i32 / v4i32).
    */
   LLVMValueRef vdata =
      count == 1 ? comps[0] : ac_build_gather_values(&ctx->ac, comps, count);

   /* Add the output's own offset to the buffer's write offset. */
   LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, so_write_offsets[output->buffer],
                                       LLVMConstInt(ctx->ac.i32, output->offset, 0), "");

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[output->buffer], vdata, NULL, voffset,
                               ctx->ac.i32_0, ac_glc | ac_slc);
}
/* Emit the transform feedback stores for all outputs of the given stream.
 *
 * The per-buffer byte offset is
 *    streamout_offset[buffer] * 4 +
 *    (streamout_write_index + thread_id) * stride[buffer] * 4 +
 *    the output's own offset (added by radv_emit_stream_output).
 *
 * Everything is wrapped in a conditional so that only threads below the
 * vertex count provided in streamout_config write data.
 */
static void
radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
{
   assert(ctx->args->ac.streamout_config.used);

   /* Vertex count: bits [22:16] of streamout_config, i.e. (so_param >> 16) & 127. */
   LLVMValueRef vtx_count = ac_build_bfe(
      &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
      LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);

   LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);

   /* Only threads with thread_id < vtx_count may write. Guarding the stores
    * this way is what prevents out-of-bounds buffer accesses: the count comes
    * from the hardware through an SGPR.
    */
   LLVMValueRef in_range =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, thread_id, vtx_count, "");
   ac_build_ifcc(&ctx->ac, in_range, 6501);

   /* streamout_write_index + thread_id */
   LLVMValueRef write_index = LLVMBuildAdd(
      ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index), thread_id, "");

   /* Descriptor and write offset of every enabled buffer; entries of disabled
    * buffers stay zero-initialized.
    */
   LLVMValueRef write_offsets[4] = {0};
   LLVMValueRef descriptors[4] = {0};
   struct ac_llvm_pointer buf_ptr =
      ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->streamout_buffers);

   for (unsigned b = 0; b < 4; b++) {
      const uint16_t stride = ctx->shader_info->so.strides[b];

      /* A zero stride marks a buffer that is not used. */
      if (stride) {
         descriptors[b] =
            ac_build_load_to_sgpr(&ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, b, false));

         LLVMValueRef base = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[b]);
         LLVMValueRef base_x4 =
            LLVMBuildMul(ctx->ac.builder, base, LLVMConstInt(ctx->ac.i32, 4, false), "");

         /* write_index * (stride * 4) + streamout_offset * 4 */
         write_offsets[b] = ac_build_imad(
            &ctx->ac, write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), base_x4);
      }
   }

   /* Store every output that belongs to the requested stream. */
   for (unsigned o = 0; o < ctx->shader_info->so.num_outputs; o++) {
      const struct radv_stream_output *output = &ctx->shader_info->so.outputs[o];

      if (output->stream != stream)
         continue;

      /* Load all four channels of the location; the callee picks the ones it needs. */
      struct radv_shader_output_values shader_out = {0};
      for (int chan = 0; chan < 4; chan++)
         shader_out.values[chan] = radv_load_output(ctx, output->location, chan);

      radv_emit_stream_output(ctx, descriptors, write_offsets, output, &shader_out);
   }

   ac_build_endif(&ctx->ac, 6501);
}
static void static void
radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs, radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
unsigned noutput, const struct radv_vs_output_info *outinfo, unsigned noutput, const struct radv_vs_output_info *outinfo,

View File

@@ -171,17 +171,8 @@ enum radv_ud_index {
AC_UD_MAX_UD = AC_UD_CS_MAX_UD, AC_UD_MAX_UD = AC_UD_CS_MAX_UD,
}; };
/* Description of one transform feedback output, filled from the NIR xfb info
 * in gather_xfb_info() and read by the streamout emission code. The array of
 * these is copied with memcpy into the ACO shader info, so the layout must stay
 * compatible with the destination struct.
 */
struct radv_stream_output {
/* Output location the channel values are loaded from. */
uint8_t location;
/* Index of the streamout buffer this output is written to. */
uint8_t buffer;
/* Offset of this output, added to the buffer's per-vertex write offset. */
uint16_t offset;
/* Bitmask of the components to write. */
uint8_t component_mask;
/* Stream this output belongs to (looked up from the buffer via buffer_to_stream). */
uint8_t stream;
};
struct radv_streamout_info { struct radv_streamout_info {
uint16_t num_outputs; uint16_t num_outputs;
struct radv_stream_output outputs[MAX_SO_OUTPUTS];
uint16_t strides[MAX_SO_BUFFERS]; uint16_t strides[MAX_SO_BUFFERS];
uint32_t enabled_stream_buffers_mask; uint32_t enabled_stream_buffers_mask;
}; };

View File

@@ -287,15 +287,9 @@ gather_xfb_info(const nir_shader *nir, struct radv_shader_info *info)
so->num_outputs = xfb->output_count; so->num_outputs = xfb->output_count;
for (unsigned i = 0; i < xfb->output_count; i++) { for (unsigned i = 0; i < xfb->output_count; i++) {
struct radv_stream_output *output = &so->outputs[i]; unsigned output_buffer = xfb->outputs[i].buffer;
unsigned stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
output->buffer = xfb->outputs[i].buffer; so->enabled_stream_buffers_mask |= (1 << output_buffer) << (stream * 4);
output->stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
output->offset = xfb->outputs[i].offset;
output->location = xfb->outputs[i].location;
output->component_mask = xfb->outputs[i].component_mask;
so->enabled_stream_buffers_mask |= (1 << output->buffer) << (output->stream * 4);
} }
for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) { for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {