diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index c7c18078c20..fae8aa1d6e6 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -1004,6 +1004,82 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
    }
 }
 
+static void
+emit_urb_indirect_vec4_write(const fs_builder &bld,
+                             const fs_reg &offset_src,
+                             unsigned base,
+                             const fs_reg &src,
+                             fs_reg urb_handle,
+                             unsigned src_comp_offset,
+                             unsigned dst_comp_offset,
+                             unsigned comps,
+                             unsigned mask)
+{
+   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
+      fs_builder bld8 = bld.group(8, q);
+
+      fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      bld8.MOV(off, quarter(offset_src, q));
+      bld8.ADD(off, off, brw_imm_ud(base));
+      bld8.SHR(off, off, brw_imm_ud(2));
+
+      fs_reg payload_srcs[4];
+      unsigned length = 0;
+
+      for (unsigned i = 0; i < dst_comp_offset; i++)
+         payload_srcs[length++] = reg_undef;
+
+      for (unsigned c = 0; c < comps; c++)
+         payload_srcs[length++] = quarter(offset(src, bld, c + src_comp_offset), q);
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
+      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
+      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                          BRW_REGISTER_TYPE_F);
+      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
+
+      fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
+                                reg_undef, srcs, ARRAY_SIZE(srcs));
+      inst->mlen = 3 + length;
+      inst->offset = 0;
+   }
+}
+
+static void
+emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
+                             const fs_reg &src, const fs_reg &offset_src,
+                             fs_reg urb_handle, unsigned mod)
+{
+   assert(nir_src_bit_size(instr->src[0]) == 32);
+
+   const unsigned comps = nir_src_num_components(instr->src[0]);
+   assert(comps <= 4);
+
+   const unsigned mask = nir_intrinsic_write_mask(instr);
+   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
+                                   component_from_intrinsic(instr);
+
+   const unsigned comp_shift = mod;
+   const unsigned first_comps = MIN2(comps, 4 - comp_shift);
+   const unsigned second_comps = comps - first_comps;
+   const unsigned first_mask = (mask << comp_shift) & 0xF;
+   const unsigned second_mask = (mask >> (4 - comp_shift)) & 0xF;
+
+   if (first_mask > 0) {
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
+                                   urb_handle, 0, comp_shift, first_comps,
+                                   first_mask);
+   }
+
+   if (second_mask > 0) {
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords + 4, src,
+                                   urb_handle, first_comps, 0, second_comps,
+                                   second_mask);
+   }
+}
+
 static void
 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
                          const fs_reg &src, const fs_reg &offset_src,
@@ -1184,15 +1260,30 @@ fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *ins
    ubld8.MOV(h, urb_handle);
    ubld8.AND(h, h, brw_imm_ud(0xFFFF));
 
-   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
-    * the non-array-index offset, we could use to decide if we can perform
-    * either one or (at most) two writes instead one per component.
-    */
-
-   if (nir_src_is_const(*offset_nir_src))
+   if (nir_src_is_const(*offset_nir_src)) {
       emit_urb_direct_writes(bld, instr, src, h);
-   else
-      emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+   } else {
+      bool use_mod = false;
+      unsigned mod;
+
+      if (offset_nir_src->is_ssa) {
+         /* Try to calculate the value of (offset + base) % 4. If we can do
+          * this, then we can do indirect writes using only up to 2 URB
+          * writes (1 if modulo + num_comps is <= 4).
+          */
+         use_mod = nir_mod_analysis(nir_get_ssa_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
+         if (use_mod) {
+            mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
+            mod %= 4;
+         }
+      }
+
+      if (use_mod) {
+         emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), h, mod);
+      } else {
+         emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+      }
+   }
 }
 
 void
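
Illustrative aside, not part of the patch: emit_urb_indirect_writes_mod shifts the written components by (offset + base) % 4 inside a vec4 URB slot and, when the data straddles a slot boundary, issues a second write for the remainder. Below is a minimal standalone sketch of that mask/component arithmetic, with hand-picked values (comps = 4, mask = 0xF, mod = 3) and std::min standing in for Mesa's MIN2 macro, so the two resulting writes can be checked outside the driver.

/* Standalone sketch of the split in emit_urb_indirect_writes_mod
 * (illustrative values only). */
#include <algorithm>
#include <cstdio>

int main()
{
   const unsigned comps = 4;    /* vec4 store                   */
   const unsigned mask  = 0xF;  /* all four components written  */
   const unsigned mod   = 3;    /* (offset + base) % 4          */

   const unsigned comp_shift   = mod;
   const unsigned first_comps  = std::min(comps, 4 - comp_shift);
   const unsigned second_comps = comps - first_comps;
   const unsigned first_mask   = (mask << comp_shift) & 0xF;
   const unsigned second_mask  = (mask >> (4 - comp_shift)) & 0xF;

   /* Expected: the first write covers 1 component with channel mask 0x8
    * (component 0 lands in the last dword of the first vec4 slot); the
    * second write covers the remaining 3 components with channel mask 0x7. */
   printf("first:  comps=%u mask=0x%X\n", first_comps, first_mask);
   printf("second: comps=%u mask=0x%X\n", second_comps, second_mask);
   return 0;
}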