diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 1b41b0bd206..073a1ad391c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5127,6 +5127,46 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_load_global_constant_uniform_block_intel: { /* Block load from a uniformized address: A64 oword block reads into a staging register, then MOVs into dest. */
+      const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4); /* num_components padded via ALIGN to a multiple of REG_SIZE / 4 dwords (assumes ALIGN rounds up -- confirm). */
+      unsigned loaded_dwords = 0; /* Dwords fetched so far. */
+
+      const fs_builder ubld1 = bld.exec_all().group(1, 0); /* exec_all builders of group width 1, 8 and 16. */
+      const fs_builder ubld8 = bld.exec_all().group(8, 0);
+      const fs_builder ubld16 = bld.exec_all().group(16, 0);
+
+      const fs_reg packed_consts =
+         ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); /* UD VGRF sized by total_dwords; every block read below writes into it. */
+      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0])); /* Address comes from src[0], passed through emit_uniformize. */
+
+      while (loaded_dwords < total_dwords) { /* One block read per iteration until total_dwords are covered. */
+         const unsigned block =
+            choose_oword_block_size_dwords(devinfo,
+                                           total_dwords - loaded_dwords); /* Block size, in dwords, picked for the remaining count. */
+         const unsigned block_bytes = block * 4; /* 4 bytes per dword. */
+
+         const fs_builder &ubld = block <= 8 ? ubld8 : ubld16; /* 8-wide builder for blocks of up to 8 dwords, 16-wide otherwise. */
+
+         fs_reg srcs[A64_LOGICAL_NUM_SRCS];
+         srcs[A64_LOGICAL_ADDRESS] = address;
+         srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
+         srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); /* Immediate argument is the block size in dwords. */
+         srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); /* NOTE(review): 0 presumably leaves helper invocations disabled -- confirm. */
+         ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
+                   retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), /* Destination: packed_consts at the byte offset of the dwords already loaded. */
+                   srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; /* Record how many bytes this read writes. */
+
+         increment_a64_address(ubld1, address, block_bytes); /* Step the address past the block just read. */
+         loaded_dwords += block;
+      }
+
+      for (unsigned c = 0; c < instr->num_components; c++) /* Copy only the num_components dwords that were requested; the ALIGN padding is not copied. */
+         bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
+                 component(packed_consts, c));
+
+      break;
+   }
+
    case nir_intrinsic_load_ssbo: {
       assert(devinfo->ver >= 7);
 
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 5803dbdbbc1..eea8c374fd2 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1310,7 +1310,8 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
        low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) {
+       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
+       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) { /* The new global-constant uniform block load goes through the same num_components checks as the other block loads. */
       if (num_components > 4) {
          if (!util_is_power_of_two_nonzero(num_components))
             return false;
diff --git a/src/intel/compiler/brw_nir_blockify_uniform_loads.c b/src/intel/compiler/brw_nir_blockify_uniform_loads.c
index e78b582753b..d28d6a4adf6 100644
--- a/src/intel/compiler/brw_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/brw_nir_blockify_uniform_loads.c
@@ -87,6 +87,22 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
       intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
       return true;
 
+   case nir_intrinsic_load_global_constant: /* Candidate for conversion to nir_intrinsic_load_global_constant_uniform_block_intel. */
+      if (nir_src_is_divergent(intrin->src[0])) /* A divergent src[0] disqualifies the load; it is left untouched. */
+         return false;
+
+      if (nir_dest_bit_size(intrin->dest) != 32) /* Only 32-bit destinations are converted. */
+         return false;
+
+      /* Without the LSC, we can only do block loads of at least 4dwords (1
+       * oword).
+       */
+      if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
+         return false;
+
+      intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel; /* Only the opcode is swapped, in place; nothing else on the instruction is modified here. */
+      return true;
+
    default:
       return false;
    }