diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 2bc4add7734..0d83300f1d1 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1765,7 +1765,8 @@ store("shared_block_intel", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
 # Similar to load_global_const_block_intel but for UBOs
 # offset should be uniform
 # src[] = { buffer_index, offset }.
-load("ubo_uniform_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], [CAN_ELIMINATE])
+load("ubo_uniform_block_intel", [-1, 1],
+     [ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], [CAN_ELIMINATE, CAN_REORDER])
 
 # Similar to load_global_const_block_intel but for SSBOs
 # offset should be uniform
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 08461043360..1b41b0bd206 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4836,7 +4836,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_ubo: {
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_uniform_block_intel: {
      fs_reg surface, surface_handle;
 
      if (get_nir_src_bindless(instr->src[0]))
@@ -4845,16 +4846,72 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         surface = get_nir_buffer_intrinsic_index(bld, instr);
 
      if (!nir_src_is_const(instr->src[1])) {
-         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
-                                     BRW_REGISTER_TYPE_UD);
+         if (instr->intrinsic == nir_intrinsic_load_ubo) {
+            /* load_ubo with non-uniform offset */
+            fs_reg base_offset = retype(get_nir_src(instr->src[1]),
+                                        BRW_REGISTER_TYPE_UD);
 
-         for (int i = 0; i < instr->num_components; i++)
-            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
-                                       surface, surface_handle,
-                                       base_offset, i * type_sz(dest.type),
-                                       nir_dest_bit_size(instr->dest) / 8);
+            for (int i = 0; i < instr->num_components; i++)
+               VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
+                                          surface, surface_handle,
+                                          base_offset, i * type_sz(dest.type),
+                                          nir_dest_bit_size(instr->dest) / 8);
 
-         prog_data->has_ubo_pull = true;
+            prog_data->has_ubo_pull = true;
+         } else {
+            /* load_ubo with uniform offset */
+            const fs_builder ubld1 = bld.exec_all().group(1, 0);
+            const fs_builder ubld8 = bld.exec_all().group(8, 0);
+            const fs_builder ubld16 = bld.exec_all().group(16, 0);
+
+            fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+            srcs[SURFACE_LOGICAL_SRC_SURFACE]        = surface;
+            srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
+
+            const nir_src load_offset = instr->src[1];
+            if (nir_src_is_const(load_offset)) {
+               fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
+               ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
+               srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
+            } else {
+               srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+                  bld.emit_uniformize(get_nir_src(load_offset));
+            }
+
+            const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
+            unsigned loaded_dwords = 0;
+
+            const fs_reg packed_consts =
+               ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
+
+            while (loaded_dwords < total_dwords) {
+               const unsigned block =
+                  choose_oword_block_size_dwords(devinfo,
+                                                 total_dwords - loaded_dwords);
+               const unsigned block_bytes = block * 4;
+
+               srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
+
+               const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
+               ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
+                         retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
+                         srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = align(block_bytes, REG_SIZE);
+
+               loaded_dwords += block;
+
+               ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
+                         srcs[SURFACE_LOGICAL_SRC_ADDRESS],
+                         brw_imm_ud(block_bytes));
+            }
+
+            for (unsigned c = 0; c < instr->num_components; c++) {
+               bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
+                       component(packed_consts, c));
+            }
+
+            prog_data->has_ubo_pull = true;
+         }
      } else {
         /* Even if we are loading doubles, a pull constant load will load
          * a 32-bit vec4, so should only reserve vgrf space for that. If we
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 60a6194ff6b..5803dbdbbc1 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1308,6 +1308,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
       return false;
 
    if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
+       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
       low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) {
      if (num_components > 4) {
diff --git a/src/intel/compiler/brw_nir_blockify_uniform_loads.c b/src/intel/compiler/brw_nir_blockify_uniform_loads.c
index adc70bd0c33..e78b582753b 100644
--- a/src/intel/compiler/brw_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/brw_nir_blockify_uniform_loads.c
@@ -37,6 +37,7 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
    switch (intrin->intrinsic) {
+   case nir_intrinsic_load_ubo:
    case nir_intrinsic_load_ssbo:
      /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
       *
@@ -60,7 +61,10 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
      if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
         return false;
 
-      intrin->intrinsic = nir_intrinsic_load_ssbo_uniform_block_intel;
+      intrin->intrinsic =
+         intrin->intrinsic == nir_intrinsic_load_ubo ?
+         nir_intrinsic_load_ubo_uniform_block_intel :
+         nir_intrinsic_load_ssbo_uniform_block_intel;
      return true;
 
    case nir_intrinsic_load_shared:
diff --git a/src/intel/compiler/brw_nir_lower_non_uniform_resource_intel.c b/src/intel/compiler/brw_nir_lower_non_uniform_resource_intel.c
index b9d4cb31f52..d49e6fbd34a 100644
--- a/src/intel/compiler/brw_nir_lower_non_uniform_resource_intel.c
+++ b/src/intel/compiler/brw_nir_lower_non_uniform_resource_intel.c
@@ -92,6 +92,7 @@ brw_nir_lower_non_uniform_intrinsic(nir_builder *b,
    case nir_intrinsic_ssbo_atomic_swap:
    case nir_intrinsic_load_ssbo_block_intel:
    case nir_intrinsic_store_ssbo_block_intel:
+   case nir_intrinsic_load_ubo_uniform_block_intel:
    case nir_intrinsic_load_ssbo_uniform_block_intel:
    case nir_intrinsic_image_load_raw_intel:
    case nir_intrinsic_image_store_raw_intel:
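
Note on the brw_fs_nir.cpp change: the uniform-offset path rounds the load up to whole GRFs (REG_SIZE / 4 dwords), then splits it into OWord block reads, bumping the address register by each block's byte size after the read. Below is a minimal standalone C sketch of just that splitting arithmetic; REG_SIZE (32 bytes per GRF) matches brw_reg.h, while pick_block_dwords is a hypothetical stand-in for Mesa's choose_oword_block_size_dwords and does not reproduce its exact selection rules.

/* Sketch of the dword-splitting loop in the new uniform-offset UBO path. */
#include <stdio.h>

#define REG_SIZE 32                       /* one GRF: 32 bytes = 8 dwords */
#define ALIGN(v, a) (((v) + (a) - 1) / (a) * (a))

/* Assumed chooser: largest power-of-two OWord block (1 OWord = 4 dwords)
 * that is no larger than the remaining span, capped at 32 dwords. */
static unsigned
pick_block_dwords(unsigned remaining)
{
   unsigned block = 32;
   while (block > remaining && block > 4)
      block /= 2;
   return block;
}

int
main(void)
{
   const unsigned num_components = 20;    /* dwords requested by the load */
   const unsigned total_dwords = ALIGN(num_components, REG_SIZE / 4);

   unsigned loaded_dwords = 0, offset_bytes = 0;
   while (loaded_dwords < total_dwords) {
      const unsigned block = pick_block_dwords(total_dwords - loaded_dwords);
      printf("read %u dwords at byte offset %u\n", block, offset_bytes);
      loaded_dwords += block;             /* advance past this block */
      offset_bytes += block * 4;          /* the ubld1.ADD on the address */
   }
   return 0;
}

With num_components = 20, total_dwords rounds up to 24 and the sketch prints a 16-dword read at offset 0 followed by an 8-dword read at offset 64, mirroring how the backend's while loop issues SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL in progressively smaller blocks.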