intel/nir: remove load_global_const_block_intel intrinsic

load_global_constant_uniform_block_intel is equivalent in terms of
loading; for the predicate, we simply emit a bcsel on the loaded value
afterward at the call sites that require it.
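
Roughly, at a call site that still needs the old predicate semantics
(a sketch only; b, pred and the block-load result data are assumed to
be in scope -- the full load call appears in the hunks below):

    /* The predicate no longer gates the load; it selects the result. */
    data = nir_bcsel(b, pred, data, nir_imm_int(b, 0));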

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30659>
Author:    Lionel Landwerlin
Date:      2024-08-14 20:15:44 +03:00
Committer: Marge Bot
Parent:    a15466187c
Commit:    fbafa9cabd

11 changed files with 40 additions and 201 deletions


@@ -226,7 +226,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_scalar_arg_amd:
case nir_intrinsic_load_smem_amd:
case nir_intrinsic_load_resume_shader_address_amd:
-case nir_intrinsic_load_global_const_block_intel:
case nir_intrinsic_load_reloc_const_intel:
case nir_intrinsic_load_btd_global_arg_addr_intel:
case nir_intrinsic_load_btd_local_arg_addr_intel:


@@ -2133,14 +2133,6 @@ image("load_raw_intel", src_comp=[1], dest_comp=0,
flags=[CAN_ELIMINATE])
image("store_raw_intel", src_comp=[1, 0])
-# Intrinsic to load a block of at least 32B of constant data from a 64-bit
-# global memory address. The memory address must be uniform and 32B-aligned.
-# The second source is a predicate which indicates whether or not to actually
-# do the load.
-# src[] = { address, predicate }.
-intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0,
-bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
# Number of data items being operated on for a SIMD program.
system_value("simd_width_intel", 1)

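For reference, the migration for the definition removed above looks roughly
like this (a sketch; b, addr and pred are assumed nir_builder values, and the
access/alignment values are the ones used by call sites later in this patch):

   /* Removed form: src[] = { address, predicate }. */
   nir_def *old_val = nir_load_global_const_block_intel(b, 16, addr, pred);

   /* Replacement: a single address source; reorderability is now requested
    * via the access qualifier and the block shape via the alignment info. */
   nir_def *new_val = nir_load_global_constant_uniform_block_intel(
      b, 16, 32, addr,
      .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
      .align_mul = 64,
      .align_offset = 64);

   /* Only needed where the predicate actually mattered. */
   new_val = nir_bcsel(b, pred, new_val, nir_imm_int(b, 0));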

@@ -155,7 +155,6 @@ can_remat_instr(nir_instr *instr, struct sized_bitset *remat)
case nir_intrinsic_load_vulkan_descriptor:
case nir_intrinsic_load_push_constant:
case nir_intrinsic_load_global_constant:
-case nir_intrinsic_load_global_const_block_intel:
case nir_intrinsic_load_desc_set_address_intel:
/* These intrinsics don't need to be spilled as long as they don't
* depend on any spilled values.


@@ -6816,69 +6816,6 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
fs_nir_emit_global_atomic(ntb, bld, instr);
break;
-case nir_intrinsic_load_global_const_block_intel: {
-assert(instr->def.bit_size == 32);
-assert(instr->num_components == 8 || instr->num_components == 16);
-const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-brw_reg load_val;
-bool is_pred_const = nir_src_is_const(instr->src[1]);
-if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-/* In this case, we don't want the UBO load at all. We really
-* shouldn't get here but it's possible.
-*/
-load_val = brw_imm_ud(0);
-} else {
-/* The uniform process may stomp the flag so do this first */
-brw_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-load_val = ubld.vgrf(BRW_TYPE_UD);
-/* If the predicate is constant and we got here, then it's non-zero
-* and we don't need the predicate at all.
-*/
-if (!is_pred_const) {
-/* Load the predicate */
-brw_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-mov->conditional_mod = BRW_CONDITIONAL_NZ;
-/* Stomp the destination with 0 if we're OOB */
-mov = ubld.MOV(load_val, brw_imm_ud(0));
-mov->predicate = BRW_PREDICATE_NORMAL;
-mov->predicate_inverse = true;
-}
-brw_reg srcs[A64_LOGICAL_NUM_SRCS];
-srcs[A64_LOGICAL_ADDRESS] = addr;
-srcs[A64_LOGICAL_SRC] = brw_reg(); /* No source data */
-srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
-/* This intrinsic loads memory from a uniform address, sometimes
-* shared across lanes. We never need to mask it.
-*/
-srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
-fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-load_val, srcs, A64_LOGICAL_NUM_SRCS);
-if (!is_pred_const)
-load->predicate = BRW_PREDICATE_NORMAL;
-}
-/* From the HW perspective, we just did a single SIMD16 instruction
-* which loaded a dword in each SIMD channel. From NIR's perspective,
-* this instruction returns a vec16. Any users of this data in the
-* back-end will expect a vec16 per SIMD channel so we have to emit a
-* pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-* will generally clean them up for us.
-*/
-for (unsigned i = 0; i < instr->num_components; i++) {
-bld.MOV(retype(offset(dest, bld, i), BRW_TYPE_UD),
-component(load_val, i));
-}
-break;
-}
case nir_intrinsic_load_global_constant_uniform_block_intel: {
const unsigned total_dwords = ALIGN(instr->num_components,
REG_SIZE * reg_unit(devinfo) / 4);

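To illustrate the rounding in the surviving block-load path above (the values
are assumptions for the sake of the example, not taken from this diff; the
macro below just rounds up to a power-of-two multiple, which is what ALIGN is
used for here):

   #define ROUND_UP_POT(x, pot) (((x) + (pot) - 1) & ~((pot) - 1))

   /* Suppose REG_SIZE * reg_unit(devinfo) / 4 evaluates to 8 dwords; then a
    * 10-component load is padded out to whole registers: */
   enum { EXAMPLE_TOTAL_DWORDS = ROUND_UP_POT(10, 8) }; /* 16 dwords, 64 bytes */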

@@ -1421,8 +1421,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
if (bit_size > 32)
return false;
-if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
@@ -2175,8 +2174,12 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-data[i] = nir_load_global_const_block_intel(b, 16, addr,
-nir_imm_true(b));
+data[i] = nir_load_global_constant_uniform_block_intel(
+b, 16, 32, addr,
+.access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+.align_mul = 64,
+.align_offset = 64);
}
sysval = nir_extract_bits(b, data, 2, suboffset * 8,


@@ -60,10 +60,13 @@ brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
}
static inline nir_def *
-brw_nir_rt_load_const(nir_builder *b, unsigned components,
-nir_def *addr, nir_def *pred)
+brw_nir_rt_load_const(nir_builder *b, unsigned components, nir_def *addr)
{
-return nir_load_global_const_block_intel(b, components, addr, pred);
+return nir_load_global_constant_uniform_block_intel(
+b, components, 32, addr,
+.access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+.align_mul = 64,
+.align_offset = 64);
}
static inline nir_def *
@@ -312,7 +315,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_def *addr)
{
nir_def *data;
-data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
+data = brw_nir_rt_load_const(b, 16, addr);
defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
defs->call_stack_handler_addr =
@@ -335,7 +338,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->sw_stack_size = nir_channel(b, data, 12);
defs->launch_size = nir_channels(b, data, 0x7u << 13);
-data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
+data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64));
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
nir_extract_i16(b, nir_channel(b, data, 1),


@@ -5251,69 +5251,6 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
fs_nir_emit_global_atomic(ntb, bld, instr);
break;
-case nir_intrinsic_load_global_const_block_intel: {
-assert(instr->def.bit_size == 32);
-assert(instr->num_components == 8 || instr->num_components == 16);
-const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-elk_fs_reg load_val;
-bool is_pred_const = nir_src_is_const(instr->src[1]);
-if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-/* In this case, we don't want the UBO load at all. We really
-* shouldn't get here but it's possible.
-*/
-load_val = elk_imm_ud(0);
-} else {
-/* The uniform process may stomp the flag so do this first */
-elk_fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-load_val = ubld.vgrf(ELK_REGISTER_TYPE_UD);
-/* If the predicate is constant and we got here, then it's non-zero
-* and we don't need the predicate at all.
-*/
-if (!is_pred_const) {
-/* Load the predicate */
-elk_fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-elk_fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-mov->conditional_mod = ELK_CONDITIONAL_NZ;
-/* Stomp the destination with 0 if we're OOB */
-mov = ubld.MOV(load_val, elk_imm_ud(0));
-mov->predicate = ELK_PREDICATE_NORMAL;
-mov->predicate_inverse = true;
-}
-elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
-srcs[A64_LOGICAL_ADDRESS] = addr;
-srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
-srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
-/* This intrinsic loads memory from a uniform address, sometimes
-* shared across lanes. We never need to mask it.
-*/
-srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
-elk_fs_inst *load = ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-load_val, srcs, A64_LOGICAL_NUM_SRCS);
-if (!is_pred_const)
-load->predicate = ELK_PREDICATE_NORMAL;
-}
-/* From the HW perspective, we just did a single SIMD16 instruction
-* which loaded a dword in each SIMD channel. From NIR's perspective,
-* this instruction returns a vec16. Any users of this data in the
-* back-end will expect a vec16 per SIMD channel so we have to emit a
-* pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-* will generally clean them up for us.
-*/
-for (unsigned i = 0; i < instr->num_components; i++) {
-bld.MOV(retype(offset(dest, bld, i), ELK_REGISTER_TYPE_UD),
-component(load_val, i));
-}
-break;
-}
case nir_intrinsic_load_global_constant_uniform_block_intel: {
const unsigned total_dwords = ALIGN(instr->num_components,
REG_SIZE * reg_unit(devinfo) / 4);


@@ -1139,8 +1139,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
if (bit_size > 32)
return false;
-if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
@@ -1873,8 +1872,7 @@ elk_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-data[i] = nir_load_global_const_block_intel(b, 16, addr,
-nir_imm_true(b));
+data[i] = nir_load_global_constant_uniform_block_intel(b, 16, 32, addr);
}
sysval = nir_extract_bits(b, data, 2, suboffset * 8,


@@ -106,39 +106,6 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
return true;
-case nir_intrinsic_load_global_const_block_intel:
-/* Only deal with the simple predication true case */
-if (!nir_src_is_const(intrin->src[1]) ||
-nir_src_as_uint(intrin->src[1]) == 0)
-return false;
-if (nir_src_is_divergent(intrin->src[0]))
-return false;
-if (intrin->def.bit_size != 32)
-return false;
-/* Without the LSC, we can only do block loads of at least 4dwords (1
-* oword).
-*/
-if (!devinfo->has_lsc && intrin->def.num_components < 4)
-return false;
-b->cursor = nir_before_instr(&intrin->instr);
-nir_def *def =
-nir_load_global_constant_uniform_block_intel(
-b,
-intrin->def.num_components,
-intrin->def.bit_size,
-intrin->src[0].ssa,
-.access = ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER,
-.align_mul = 4,
-.align_offset = 4);
-nir_def_rewrite_uses(&intrin->def, def);
-nir_instr_remove(&intrin->instr);
-return true;
default:
return false;
}

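As a unit check on the "1 oword" constraint mentioned in the removed code
above (illustrative only, not part of the patch):

   #include <stdint.h>

   /* One OWord is 16 bytes, i.e. four 32-bit dwords, which is where the
    * pre-LSC "num_components < 4" rejection came from. */
   _Static_assert(16 / sizeof(uint32_t) == 4, "1 OWord == 4 dwords");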

@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
/* Load two just in case we go over a 64B boundary */
nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
-nir_def *pred;
+nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+data[i] = nir_load_global_constant_uniform_block_intel(
+b, 16, 32, addr,
+.access = nir_intrinsic_access(load),
+.align_mul = 64,
+.align_offset = 64);
if (bound) {
-pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-} else {
-pred = nir_imm_true(b);
+data[i] = nir_bcsel(b,
+nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+data[i],
+nir_imm_int(b, 0));
}
-nir_def *addr = nir_iadd_imm(b, base_addr,
-aligned_offset + i * 64);
-data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
}
val = nir_extract_bits(b, data, 2, suboffset * 8,

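A worked instance of the new bound handling above (the numbers are made up for
illustration): the comparison that used to produce the load predicate now
decides whether the already-loaded 64B chunk is kept or zero-filled.

   #include <stdbool.h>

   /* CPU-side mirror of nir_igt_imm(b, bound, aligned_offset + i * 64 + 63). */
   static bool chunk_in_bounds(unsigned bound, unsigned aligned_offset,
                               unsigned i)
   {
      return bound > aligned_offset + i * 64 + 63;
   }

   /* e.g. bound = 80, aligned_offset = 0:
    *   i = 0 covers bytes 0..63   -> 80 > 63  -> loaded data is kept
    *   i = 1 covers bytes 64..127 -> 80 > 127 -> false, bcsel yields zeros */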

@@ -44,7 +44,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
unsigned byte_size = bit_size / 8;
nir_def *val;
-if (nir_src_is_const(load->src[1])) {
+if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) {
uint32_t offset = nir_src_as_uint(load->src[1]);
/* Things should be component-aligned. */
@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
/* Load two just in case we go over a 64B boundary */
nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
-nir_def *pred;
+nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+data[i] = nir_load_global_constant_uniform_block_intel(
+b, 16, 32, addr,
+.access = nir_intrinsic_access(load),
+.align_mul = 64,
+.align_offset = 64);
if (bound) {
-pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-} else {
-pred = nir_imm_true(b);
+data[i] = nir_bcsel(b,
+nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+data[i],
+nir_imm_int(b, 0));
}
-nir_def *addr = nir_iadd_imm(b, base_addr,
-aligned_offset + i * 64);
-data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
}
val = nir_extract_bits(b, data, 2, suboffset * 8,