radv,aco: remove old streamout code
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18898>
This commit is contained in:
@@ -11553,119 +11553,6 @@ create_fs_exports(isel_context* ctx)
|
||||
ctx->block->kind |= block_kind_export_end;
|
||||
}
|
||||
|
||||
static void
|
||||
emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
|
||||
const struct aco_stream_output* output)
|
||||
{
|
||||
assert(ctx->stage.hw == HWStage::VS);
|
||||
|
||||
unsigned loc = output->location;
|
||||
unsigned buf = output->buffer;
|
||||
|
||||
unsigned writemask = output->component_mask & ctx->outputs.mask[loc];
|
||||
while (writemask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&writemask, &start, &count);
|
||||
if (count == 3 && ctx->options->gfx_level == GFX6) {
|
||||
/* GFX6 doesn't support storing vec3, split it. */
|
||||
writemask |= 1u << (start + 2);
|
||||
count = 2;
|
||||
}
|
||||
|
||||
unsigned offset = output->offset + (start - (ffs(output->component_mask) - 1)) * 4;
|
||||
|
||||
Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
|
||||
for (int i = 0; i < count; ++i)
|
||||
vec->operands[i] = Operand(ctx->outputs.temps[loc * 4 + start + i]);
|
||||
vec->definitions[0] = Definition(write_data);
|
||||
ctx->block->instructions.emplace_back(std::move(vec));
|
||||
|
||||
aco_opcode opcode = get_buffer_store_op(count * 4);
|
||||
aco_ptr<MUBUF_instruction> store{
|
||||
create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
|
||||
store->operands[0] = Operand(so_buffers[buf]);
|
||||
store->operands[1] = Operand(so_write_offset[buf]);
|
||||
store->operands[2] = Operand::c32(0);
|
||||
store->operands[3] = Operand(write_data);
|
||||
if (offset > 4095) {
|
||||
/* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
store->operands[1] =
|
||||
bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
|
||||
} else {
|
||||
store->offset = offset;
|
||||
}
|
||||
store->offen = true;
|
||||
store->glc = ctx->program->gfx_level < GFX11;
|
||||
store->dlc = false;
|
||||
store->slc = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_streamout(isel_context* ctx, unsigned stream)
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
Temp so_vtx_count =
|
||||
bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
|
||||
get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
|
||||
|
||||
Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
|
||||
|
||||
Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
|
||||
|
||||
if_context ic;
|
||||
begin_divergent_if_then(ctx, &ic, can_emit);
|
||||
|
||||
bld.reset(ctx->block);
|
||||
|
||||
Temp so_write_index =
|
||||
bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
|
||||
|
||||
Temp so_buffers[4];
|
||||
Temp so_write_offset[4];
|
||||
Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
|
||||
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
unsigned stride = ctx->program->info.so.strides[i];
|
||||
if (!stride)
|
||||
continue;
|
||||
|
||||
so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
|
||||
bld.copy(bld.def(s1), Operand::c32(i * 16u)));
|
||||
|
||||
if (stride == 1) {
|
||||
Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
|
||||
get_arg(ctx, ctx->args->ac.streamout_write_index),
|
||||
get_arg(ctx, ctx->args->ac.streamout_offset[i]));
|
||||
Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
|
||||
|
||||
so_write_offset[i] =
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
|
||||
} else {
|
||||
Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
|
||||
Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
|
||||
get_arg(ctx, ctx->args->ac.streamout_offset[i]));
|
||||
so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ctx->program->info.so.num_outputs; i++) {
|
||||
const struct aco_stream_output* output = &ctx->program->info.so.outputs[i];
|
||||
if (stream != output->stream)
|
||||
continue;
|
||||
|
||||
emit_stream_output(ctx, so_buffers, so_write_offset, output);
|
||||
}
|
||||
|
||||
begin_divergent_if_else(ctx, &ic);
|
||||
end_divergent_if(ctx, &ic);
|
||||
}
|
||||
|
||||
Pseudo_instruction*
|
||||
add_startpgm(struct isel_context* ctx)
|
||||
{
|
||||
|
@@ -88,20 +88,6 @@ struct aco_vp_output_info {
|
||||
bool export_clip_dists;
|
||||
};
|
||||
|
||||
/* Description of one streamout output as consumed by ACO.
 *
 * radv_aco_convert_shader_so_info() fills the array of these with a memcpy
 * from the RADV-side array, so the layout has to stay identical to
 * struct radv_stream_output.
 */
struct aco_stream_output {
   uint8_t location;       /* output location; indexes ctx->outputs in ACO */
   uint8_t buffer;         /* index of the streamout buffer written to */
   uint16_t offset;        /* offset of the output within a vertex, in bytes */
   uint8_t component_mask; /* bitmask of the components to write */
   uint8_t stream;         /* stream the output belongs to */
};
|
||||
|
||||
struct aco_streamout_info {
|
||||
uint16_t num_outputs;
|
||||
struct aco_stream_output outputs[ACO_MAX_SO_OUTPUTS];
|
||||
uint16_t strides[ACO_MAX_SO_BUFFERS];
|
||||
};
|
||||
|
||||
struct aco_shader_info {
|
||||
uint8_t wave_size;
|
||||
bool is_ngg;
|
||||
@@ -143,7 +129,6 @@ struct aco_shader_info {
|
||||
struct {
|
||||
uint8_t subgroup_size;
|
||||
} cs;
|
||||
struct aco_streamout_info so;
|
||||
|
||||
uint32_t gfx9_gs_ring_lds_size;
|
||||
};
|
||||
|
@@ -34,16 +34,6 @@
|
||||
#define ASSIGN_FIELD(x) aco_info->x = radv->x
|
||||
#define ASSIGN_FIELD_CP(x) memcpy(&aco_info->x, &radv->x, sizeof(radv->x))
|
||||
|
||||
static inline void
|
||||
radv_aco_convert_shader_so_info(struct aco_shader_info *aco_info,
|
||||
const struct radv_shader_info *radv)
|
||||
{
|
||||
ASSIGN_FIELD(so.num_outputs);
|
||||
ASSIGN_FIELD_CP(so.outputs);
|
||||
ASSIGN_FIELD_CP(so.strides);
|
||||
/* enabled_stream_buffers_mask unused */
|
||||
}
|
||||
|
||||
static inline void
|
||||
radv_aco_convert_shader_vp_info(struct aco_vp_output_info *aco_info,
|
||||
const struct radv_vs_output_info *radv)
|
||||
@@ -97,7 +87,6 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info,
|
||||
ASSIGN_FIELD(ps.num_interp);
|
||||
ASSIGN_FIELD(ps.spi_ps_input);
|
||||
ASSIGN_FIELD(cs.subgroup_size);
|
||||
radv_aco_convert_shader_so_info(aco_info, radv);
|
||||
aco_info->gfx9_gs_ring_lds_size = radv->gs_ring_info.lds_size;
|
||||
}
|
||||
|
||||
|
@@ -727,126 +727,6 @@ radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
|
||||
return LLVMBuildLoad2(ctx->ac.builder, type, output, "");
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
|
||||
LLVMValueRef const *so_write_offsets,
|
||||
const struct radv_stream_output *output,
|
||||
struct radv_shader_output_values *shader_out)
|
||||
{
|
||||
unsigned num_comps = util_bitcount(output->component_mask);
|
||||
unsigned buf = output->buffer;
|
||||
unsigned offset = output->offset;
|
||||
unsigned start;
|
||||
LLVMValueRef out[4];
|
||||
|
||||
assert(num_comps && num_comps <= 4);
|
||||
if (!num_comps || num_comps > 4)
|
||||
return;
|
||||
|
||||
/* Get the first component. */
|
||||
start = ffs(output->component_mask) - 1;
|
||||
|
||||
/* Load the output as int. */
|
||||
for (int i = 0; i < num_comps; i++) {
|
||||
out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
|
||||
}
|
||||
|
||||
/* Pack the output. */
|
||||
LLVMValueRef vdata = NULL;
|
||||
|
||||
switch (num_comps) {
|
||||
case 1: /* as i32 */
|
||||
vdata = out[0];
|
||||
break;
|
||||
case 2: /* as v2i32 */
|
||||
case 3: /* as v3i32 */
|
||||
case 4: /* as v4i32 */
|
||||
vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
|
||||
break;
|
||||
}
|
||||
|
||||
LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, so_write_offsets[buf],
|
||||
LLVMConstInt(ctx->ac.i32, offset, 0), "");
|
||||
ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, NULL, voffset, ctx->ac.i32_0,
|
||||
ac_glc | ac_slc);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
|
||||
assert(ctx->args->ac.streamout_config.used);
|
||||
LLVMValueRef so_vtx_count = ac_build_bfe(
|
||||
&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
|
||||
LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
|
||||
|
||||
LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
|
||||
|
||||
/* can_emit = tid < so_vtx_count; */
|
||||
LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
|
||||
|
||||
/* Emit the streamout code conditionally. This actually avoids
|
||||
* out-of-bounds buffer access. The hw tells us via the SGPR
|
||||
* (so_vtx_count) which threads are allowed to emit streamout data.
|
||||
*/
|
||||
ac_build_ifcc(&ctx->ac, can_emit, 6501);
|
||||
{
|
||||
/* The buffer offset is computed as follows:
|
||||
* ByteOffset = streamout_offset[buffer_id]*4 +
|
||||
* (streamout_write_index + thread_id)*stride[buffer_id] +
|
||||
* attrib_offset
|
||||
*/
|
||||
LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
|
||||
|
||||
/* Compute (streamout_write_index + thread_id). */
|
||||
so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
|
||||
|
||||
/* Load the descriptor and compute the write offset for each
|
||||
* enabled buffer.
|
||||
*/
|
||||
LLVMValueRef so_write_offset[4] = {0};
|
||||
LLVMValueRef so_buffers[4] = {0};
|
||||
struct ac_llvm_pointer buf_ptr = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->streamout_buffers);
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
uint16_t stride = ctx->shader_info->so.strides[i];
|
||||
|
||||
if (!stride)
|
||||
continue;
|
||||
|
||||
LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
|
||||
|
||||
so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
|
||||
|
||||
LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
|
||||
|
||||
so_offset =
|
||||
LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
|
||||
|
||||
so_write_offset[i] = ac_build_imad(
|
||||
&ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
|
||||
}
|
||||
|
||||
/* Write streamout data. */
|
||||
for (i = 0; i < ctx->shader_info->so.num_outputs; i++) {
|
||||
struct radv_shader_output_values shader_out = {0};
|
||||
const struct radv_stream_output *output = &ctx->shader_info->so.outputs[i];
|
||||
|
||||
if (stream != output->stream)
|
||||
continue;
|
||||
|
||||
for (int j = 0; j < 4; j++) {
|
||||
shader_out.values[j] = radv_load_output(ctx, output->location, j);
|
||||
}
|
||||
|
||||
radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
|
||||
}
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 6501);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
|
||||
unsigned noutput, const struct radv_vs_output_info *outinfo,
|
||||
|
@@ -171,17 +171,8 @@ enum radv_ud_index {
|
||||
AC_UD_MAX_UD = AC_UD_CS_MAX_UD,
|
||||
};
|
||||
|
||||
/* Description of one streamout (transform feedback) output.
 *
 * gather_xfb_info() fills the fields from the NIR xfb info, and
 * radv_aco_convert_shader_so_info() copies arrays of this struct to the ACO
 * side with memcpy, so the layout has to stay identical to
 * struct aco_stream_output.
 */
struct radv_stream_output {
   uint8_t location;       /* output location */
   uint8_t buffer;         /* index of the streamout buffer written to */
   uint16_t offset;        /* offset of the output within a vertex, in bytes */
   uint8_t component_mask; /* bitmask of the components to write */
   uint8_t stream;         /* stream the output belongs to */
};
|
||||
|
||||
struct radv_streamout_info {
|
||||
uint16_t num_outputs;
|
||||
struct radv_stream_output outputs[MAX_SO_OUTPUTS];
|
||||
uint16_t strides[MAX_SO_BUFFERS];
|
||||
uint32_t enabled_stream_buffers_mask;
|
||||
};
|
||||
|
@@ -287,15 +287,9 @@ gather_xfb_info(const nir_shader *nir, struct radv_shader_info *info)
|
||||
so->num_outputs = xfb->output_count;
|
||||
|
||||
for (unsigned i = 0; i < xfb->output_count; i++) {
|
||||
struct radv_stream_output *output = &so->outputs[i];
|
||||
|
||||
output->buffer = xfb->outputs[i].buffer;
|
||||
output->stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
|
||||
output->offset = xfb->outputs[i].offset;
|
||||
output->location = xfb->outputs[i].location;
|
||||
output->component_mask = xfb->outputs[i].component_mask;
|
||||
|
||||
so->enabled_stream_buffers_mask |= (1 << output->buffer) << (output->stream * 4);
|
||||
unsigned output_buffer = xfb->outputs[i].buffer;
|
||||
unsigned stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
|
||||
so->enabled_stream_buffers_mask |= (1 << output_buffer) << (stream * 4);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
|
||||
|
Reference in New Issue
Block a user