intel/fs,rt: Add a predicate to load_global_const_block
This allows us to do a bounds-checked A64 block load without it being counted as control-flow by NIR. This means that NIR optimizations like CSE will be able to work on these the same as a regular load. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8635>
This commit is contained in:

committed by
Marge Bot

parent
2407952ec9
commit
1ce3660a5a
@@ -1191,9 +1191,11 @@ image("store_raw_intel", src_comp=[1, 0])
|
||||
|
||||
# Intrinsic to load a block of at least 32B of constant data from a 64-bit
|
||||
# global memory address. The memory address must be uniform and 32B-aligned.
|
||||
# src[] = { address }.
|
||||
intrinsic("load_global_const_block_intel", src_comp=[1], dest_comp=0, bit_sizes=[32],
|
||||
indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
# The second source is a predicate which indicates whether or not to actually
|
||||
# do the load.
|
||||
# src[] = { address, predicate }.
|
||||
intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0,
|
||||
bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
# Number of data items being operated on for a SIMD program.
|
||||
system_value("simd_width_intel", 1)
|
||||
|
@@ -4665,12 +4665,43 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||
assert(instr->num_components == 8 || instr->num_components == 16);
|
||||
|
||||
const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
|
||||
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
|
||||
tmp,
|
||||
bld.emit_uniformize(get_nir_src(instr->src[0])), /* Address */
|
||||
fs_reg(), /* No source data */
|
||||
brw_imm_ud(instr->num_components));
|
||||
fs_reg load_val;
|
||||
|
||||
bool is_pred_const = nir_src_is_const(instr->src[1]);
|
||||
if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
|
||||
/* In this case, we don't want the UBO load at all. We really
|
||||
* shouldn't get here but it's possible.
|
||||
*/
|
||||
load_val = brw_imm_ud(0);
|
||||
} else {
|
||||
/* The uniform process may stomp the flag so do this first */
|
||||
fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));
|
||||
|
||||
load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
|
||||
/* If the predicate is constant and we got here, then it's non-zero
|
||||
* and we don't need the predicate at all.
|
||||
*/
|
||||
if (!is_pred_const) {
|
||||
/* Load the predicate */
|
||||
fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));
|
||||
fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
|
||||
mov->conditional_mod = BRW_CONDITIONAL_NZ;
|
||||
|
||||
/* Stomp the destination with 0 if we're OOB */
|
||||
mov = ubld.MOV(load_val, brw_imm_ud(0));
|
||||
mov->predicate = BRW_PREDICATE_NORMAL;
|
||||
mov->predicate_inverse = true;
|
||||
}
|
||||
|
||||
fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
|
||||
load_val, addr,
|
||||
fs_reg(), /* No source data */
|
||||
brw_imm_ud(instr->num_components));
|
||||
|
||||
if (!is_pred_const)
|
||||
load->predicate = BRW_PREDICATE_NORMAL;
|
||||
}
|
||||
|
||||
/* From the HW perspective, we just did a single SIMD16 instruction
|
||||
* which loaded a dword in each SIMD channel. From NIR's perspective,
|
||||
@@ -4681,7 +4712,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||
*/
|
||||
for (unsigned i = 0; i < instr->num_components; i++) {
|
||||
bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
|
||||
component(tmp, i));
|
||||
component(load_val, i));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@@ -164,7 +164,8 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
|
||||
nir_ssa_def *addr =
|
||||
nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b),
|
||||
aligned_offset + i * 64);
|
||||
data[i] = nir_load_global_const_block_intel(b, 16, addr);
|
||||
data[i] = nir_load_global_const_block_intel(b, 16, addr,
|
||||
nir_imm_true(b));
|
||||
}
|
||||
|
||||
sysval = nir_extract_bits(b, data, 2, suboffset * 8,
|
||||
|
@@ -217,7 +217,7 @@ brw_nir_rt_load_globals(nir_builder *b,
|
||||
nir_ssa_def *addr = nir_load_btd_global_arg_addr_intel(b);
|
||||
|
||||
nir_ssa_def *data;
|
||||
data = nir_load_global_const_block_intel(b, 16, addr);
|
||||
data = nir_load_global_const_block_intel(b, 16, addr, nir_imm_true(b));
|
||||
defs->base_mem_addr = nir_pack_64_2x32(b, nir_channels(b, data, 0x3));
|
||||
|
||||
defs->call_stack_handler_addr =
|
||||
@@ -240,7 +240,8 @@ brw_nir_rt_load_globals(nir_builder *b,
|
||||
defs->sw_stack_size = nir_channel(b, data, 12);
|
||||
defs->launch_size = nir_channels(b, data, 0x7u << 13);
|
||||
|
||||
data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64));
|
||||
data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64),
|
||||
nir_imm_true(b));
|
||||
defs->call_sbt_addr =
|
||||
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
|
||||
nir_extract_i16(b, nir_channel(b, data, 1),
|
||||
|
Reference in New Issue
Block a user