From 18e97817148eb376274a8749c03b45c2f817c139 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Sun, 6 Sep 2020 23:40:24 +0200 Subject: [PATCH] r600/sfn: Use load_ubo_vec4 lowering pass This replaces the lowering pass to align UBO loads at 16 byte boundaries. v2: use nir functions to query constants in ubo_vec4 (Eric) Signed-off-by: Gert Wollny Reviewed-by: Eric Anholt Part-of: --- src/gallium/drivers/r600/sfn/sfn_nir.cpp | 100 +----------------- .../drivers/r600/sfn/sfn_shader_base.cpp | 53 +++++----- .../drivers/r600/sfn/sfn_shader_base.h | 2 +- 3 files changed, 30 insertions(+), 125 deletions(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp index 7c88b4a4b49..091f4ffface 100644 --- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp @@ -390,100 +390,6 @@ bool r600_lower_scratch_addresses(nir_shader *shader) return progress; } -static nir_ssa_def * -r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options) -{ - b->cursor = nir_before_instr(instr); - - nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr); - assert(op->intrinsic == nir_intrinsic_load_ubo); - - bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0])); - - nir_ssa_def *offset = op->src[1].ssa; - - /* This is ugly: With const addressing we can actually set a proper fetch target mask, - * but for this we need the component encoded, we don't shift and do de decoding in the - * backend. Otherwise we shift by four and resolve the component here - * (TODO: encode the start component in the intrinsic when the offset base is non-constant - * but a multiple of 16 */ - - nir_ssa_def *new_offset = offset; - if (!const_address) - new_offset = nir_ishr(b, offset, nir_imm_int(b, 4)); - - nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600); - load->num_components = const_address ? 
op->num_components : 4; - load->src[0] = op->src[0]; - load->src[1] = nir_src_for_ssa(new_offset); - nir_intrinsic_set_align(load, nir_intrinsic_align_mul(op), nir_intrinsic_align_offset(op)); - - nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL); - nir_builder_instr_insert(b, &load->instr); - - /* when four components are loaded or both the offset and the location - * are constant, then the backend can deal with it better */ - if (op->num_components == 4 || const_address) - return &load->dest.ssa; - - /* What comes below is a performance disaster when the offset is not constant - * because then we have to assume that any component can be the first one and we - * have to pick the result manually. */ - nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)), - nir_imm_int(b,3)); - - const unsigned swz_000[4] = {0, 0, 0}; - nir_ssa_def *component_select = nir_ieq(b, r600_imm_ivec3(b, 0, 1, 2), - nir_swizzle(b, first_comp, swz_000, 3)); - - if (op->num_components == 1) { - nir_ssa_def *check0 = nir_bcsel(b, nir_channel(b, component_select, 0), - nir_channel(b, &load->dest.ssa, 0), - nir_channel(b, &load->dest.ssa, 3)); - nir_ssa_def *check1 = nir_bcsel(b, nir_channel(b, component_select, 1), - nir_channel(b, &load->dest.ssa, 1), - check0); - return nir_bcsel(b, nir_channel(b, component_select, 2), - nir_channel(b, &load->dest.ssa, 2), - check1); - } else if (op->num_components == 2) { - const unsigned szw_01[2] = {0, 1}; - const unsigned szw_12[2] = {1, 2}; - const unsigned szw_23[2] = {2, 3}; - - nir_ssa_def *check0 = nir_bcsel(b, nir_channel(b, component_select, 0), - nir_swizzle(b, &load->dest.ssa, szw_01, 2), - nir_swizzle(b, &load->dest.ssa, szw_23, 2)); - return nir_bcsel(b, nir_channel(b, component_select, 1), - nir_swizzle(b, &load->dest.ssa, szw_12, 2), - check0); - } else { - const unsigned szw_012[3] = {0, 1, 2}; - const unsigned szw_123[3] = {1, 2, 3}; - return nir_bcsel(b, nir_channel(b, component_select, 
0), - nir_swizzle(b, &load->dest.ssa, szw_012, 3), - nir_swizzle(b, &load->dest.ssa, szw_123, 3)); - } -} - -bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr); - return op->intrinsic == nir_intrinsic_load_ubo; -} - - -bool r600_lower_ubo_to_align16(nir_shader *shader) -{ - return nir_shader_lower_instructions(shader, - r600_lower_ubo_to_align16_filter, - r600_lower_ubo_to_align16_impl, - nullptr); -} - static void insert_uniform_sorted(struct exec_list *var_list, nir_variable *new_var) { @@ -875,10 +781,8 @@ int r600_shader_from_nir(struct r600_context *rctx, const nir_function *func = reinterpret_cast(exec_list_get_head_const(&sel->nir->functions)); assert(func->impl->registers.length() == 0 && !has_saturate(func)); - if (true) { - optimize_once(sel->nir); - NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16); - } + NIR_PASS_V(sel->nir, nir_lower_ubo_vec4); + /* It seems the output of this optimization is cached somewhere, and * when there are registers, then we can no longer copy propagate, so * skip the optimization then. 
(There is probably a better way, but yeah) diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp index 04953c1679f..13a4c688ddf 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp @@ -634,8 +634,8 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins case nir_intrinsic_discard: case nir_intrinsic_discard_if: return emit_discard_if(instr); - case nir_intrinsic_load_ubo_r600: - return emit_load_ubo(instr); + case nir_intrinsic_load_ubo_vec4: + return emit_load_ubo_vec4(instr); case nir_intrinsic_load_tcs_in_param_base_r600: return emit_load_tcs_param_base(instr, 0); case nir_intrinsic_load_tcs_out_param_base_r600: @@ -777,7 +777,7 @@ GPRVector ShaderFromNirProcessor::vec_from_nir_with_fetch_constant(const nir_src /* Now check whether all inputs come from the same GPR, and fill * empty slots in the vector with unused swizzles, bail out if - * the sources are not from the same GPR + * the sources are not from the same GPR */ if (use_same) { @@ -829,26 +829,19 @@ GPRVector ShaderFromNirProcessor::vec_from_nir_with_fetch_constant(const nir_src return GPRVector(v);; } -bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr) +bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr) { - nir_src& src0 = instr->src[0]; - nir_src& src1 = instr->src[1]; + auto bufid = nir_src_as_const_value(instr->src[0]); + auto buf_offset = nir_src_as_const_value(instr->src[1]); - int sel_bufid_reg = src0.is_ssa ? src0.ssa->index : src0.reg.reg->index; - const nir_load_const_instr* literal0 = get_literal_constant(sel_bufid_reg); - - int ofs_reg = src1.is_ssa ? 
src1.ssa->index : src1.reg.reg->index; - const nir_load_const_instr* literal1 = get_literal_constant(ofs_reg); - if (literal0) { - if (literal1) { - uint bufid = literal0->value[0].u32; - uint buf_ofs = literal1->value[0].u32 >> 4; - int buf_cmp = ((literal1->value[0].u32 >> 2) & 3); + if (bufid) { + if (buf_offset) { + int buf_cmp = nir_intrinsic_component(instr); AluInstruction *ir = nullptr; - for (int i = 0; i < instr->num_components; ++i) { + for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) { int cmp = buf_cmp + i; assert(cmp < 4); - auto u = PValue(new UniformValue(512 + buf_ofs, cmp, bufid + 1)); + auto u = PValue(new UniformValue(512 + buf_offset->u32, cmp, bufid->u32)); if (instr->dest.is_ssa) load_preloaded_value(instr->dest, i, u); else { @@ -861,21 +854,24 @@ bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr) return true; } else { - /* literal0 is lost ...*/ - return load_uniform_indirect(instr, from_nir(instr->src[1], 0, 0), 0, literal0->value[0].u32 + 1); + return load_uniform_indirect(instr, from_nir(instr->src[1], 0, 0), 0, bufid->u32); } } else { - /* TODO: This can also be solved by using the CF indes on the ALU block, and - * this would probably make sense when there are more then one loads with - * the same buffer ID. */ + /* TODO: if buf_offset is constant then this can also be solved by using the CF index + * on the ALU block, and this would probably make sense when there are more than one + * loads with the same buffer ID. 
*/ PValue bufid = from_nir(instr->src[0], 0, 0); PValue addr = from_nir_with_fetch_constant(instr->src[1], 0); GPRVector trgt; - for (int i = 0; i < 4; ++i) + std::array swz = {7,7,7,7}; + for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) { trgt.set_reg_i(i, from_nir(instr->dest, i)); + swz[i] = i + nir_intrinsic_component(instr); + } auto ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, 0, - 1, bufid, bim_zero); + 0, bufid, bim_zero); + ir->set_dest_swizzle(swz); emit_instruction(ir); for (int i = 0; i < instr->num_components ; ++i) { @@ -887,6 +883,7 @@ bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr) } + bool ShaderFromNirProcessor::emit_discard_if(nir_intrinsic_instr* instr) { r600::sfn_log << SfnLog::instr << "emit '" @@ -919,8 +916,11 @@ bool ShaderFromNirProcessor::load_uniform_indirect(nir_intrinsic_instr* instr, P } GPRVector trgt; - for (int i = 0; i < 4; ++i) + std::array swz = {7,7,7,7}; + for (int i = 0; i < 4; ++i) { trgt.set_reg_i(i, from_nir(instr->dest, i)); + swz[i] = i + nir_intrinsic_component(instr); + } if (addr->type() != Value::gpr) { emit_instruction(op1_mov, trgt.reg_i(0), {addr}, {alu_write, alu_last_instr}); @@ -930,6 +930,7 @@ bool ShaderFromNirProcessor::load_uniform_indirect(nir_intrinsic_instr* instr, P /* FIXME: buffer index and index mode are not set correctly */ auto ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, offest, bufferid, PValue(), bim_none); + ir->set_dest_swizzle(swz); emit_instruction(ir); m_sh_info.indirect_files |= 1 << TGSI_FILE_CONSTANT; return true; diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.h b/src/gallium/drivers/r600/sfn/sfn_shader_base.h index bd2cd9ce680..2bf094aa5b3 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_base.h +++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.h @@ -150,7 +150,7 @@ private: virtual bool emit_intrinsic_instruction_override(nir_intrinsic_instr* instr); bool emit_tex_instruction(nir_instr* 
instr); bool emit_discard_if(nir_intrinsic_instr* instr); - bool emit_load_ubo(nir_intrinsic_instr* instr); + bool emit_load_ubo_vec4(nir_intrinsic_instr* instr); bool emit_ssbo_atomic_add(nir_intrinsic_instr* instr); bool load_uniform_indirect(nir_intrinsic_instr* instr, PValue addr, int offest, int bufid);