aco: Refactor VS output stores in preparation for tessellation.

This commit takes the new helpers into use by the VS output
store function. This function is also where the VS outputs will
be handled when the VS runs on the HW LS stage.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3964>
This commit is contained in:
Timur Kristóf
2020-02-28 16:20:01 +01:00
committed by Marge Bot
parent 0062bb04ac
commit db93af5f1b

View File

@@ -3150,81 +3150,39 @@ std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_i
return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
}
void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr)
void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
unsigned write_mask = nir_intrinsic_write_mask(instr);
unsigned component = nir_intrinsic_component(instr);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned idx = (nir_intrinsic_base(instr) + component) * 4u;
Operand offset(s1);
Builder bld(ctx->program, ctx->block);
nir_instr *off_instr = instr->src[1].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const)
offset = bld.v_mul24_imm(bld.def(v1), get_ssa_temp(ctx, instr->src[1].ssa), 16u);
else
idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 16u;
std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned write_mask = nir_intrinsic_write_mask(instr);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
if (ctx->stage == vertex_es) {
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
Temp elems[NIR_MAX_VEC_COMPONENTS * 2];
if (elem_size_bytes == 8) {
for (unsigned i = 0; i < src.size() / 2; i++) {
Temp elem = emit_extract_vector(ctx, src, i, v2);
elems[i*2] = bld.tmp(v1);
elems[i*2+1] = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(elems[i*2]), Definition(elems[i*2+1]), elem);
}
write_mask = widen_mask(write_mask, 2);
elem_size_bytes /= 2u;
} else {
for (unsigned i = 0; i < src.size(); i++)
elems[i] = emit_extract_vector(ctx, src, i, v1);
}
while (write_mask) {
unsigned index = u_bit_scan(&write_mask);
unsigned offset = index * elem_size_bytes;
Temp elem = emit_extract_vector(ctx, src, index, RegClass(RegType::vgpr, elem_size_bytes / 4));
Operand vaddr_offset(v1);
unsigned const_offset = idx + offset;
if (const_offset >= 4096u) {
vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
const_offset %= 4096u;
}
aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
mtbuf->operands[0] = Operand(esgs_ring);
mtbuf->operands[1] = vaddr_offset;
mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->es2gs_offset));
mtbuf->operands[3] = Operand(elem);
mtbuf->offen = !vaddr_offset.isUndefined();
mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
mtbuf->offset = const_offset;
mtbuf->glc = true;
mtbuf->slc = true;
mtbuf->barrier = barrier_none;
mtbuf->can_reorder = true;
bld.insert(std::move(mtbuf));
}
Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
} else {
unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
Temp lds_base;
Temp vertex_idx = emit_mbcnt(ctx, bld.def(v1));
Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), vertex_idx,
bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
if (ctx->stage == vertex_geometry_gs) {
/* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
} else {
unreachable("Invalid LS or ES stage");
}
Temp lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
if (!offset.isUndefined())
lds_base = bld.vadd32(bld.def(v1), offset, lds_base);
offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
unsigned align = calculate_lds_alignment(ctx, idx);
store_lds(ctx, elem_size_bytes, src, write_mask, lds_base, idx, align);
}
}
@@ -3258,7 +3216,7 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
}
} else if (ctx->stage == vertex_es ||
(ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX)) {
visit_store_vsgs_output(ctx, instr);
visit_store_ls_or_es_output(ctx, instr);
} else {
unreachable("Shader stage not implemented");
}