intel/nir: remove load_global_const_block_intel intrinsic
load_global_constant_uniform_block_intel is equivalent in terms of loading; for the predicate, we simply emit a bcsel afterward in the places that require it.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30659>
Committed by: Marge Bot
Parent: a15466187c
Commit: fbafa9cabd
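
The replacement pattern, as a minimal sketch (assuming a nir_builder context; the helper name and its explicit pred parameter are illustrative, not part of this patch):

/* Before this commit, callers emitted a predicated block load:
 *    data = nir_load_global_const_block_intel(b, 16, addr, pred);
 * Now they emit a plain uniform block load and, only where a predicate is
 * still required, select the result against zero afterward.
 */
static nir_def *
load_block_then_select(nir_builder *b, nir_def *addr, nir_def *pred)
{
   nir_def *data = nir_load_global_constant_uniform_block_intel(
      b, 16, 32, addr,
      .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
      .align_mul = 64,
      .align_offset = 64);

   /* Callers that used to pass nir_imm_true(b) simply drop the predicate;
    * the others keep the same result-or-zero semantics via a bcsel.
    */
   return nir_bcsel(b, pred, data, nir_imm_int(b, 0));
}
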
@@ -226,7 +226,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_scalar_arg_amd:
    case nir_intrinsic_load_smem_amd:
    case nir_intrinsic_load_resume_shader_address_amd:
-   case nir_intrinsic_load_global_const_block_intel:
    case nir_intrinsic_load_reloc_const_intel:
    case nir_intrinsic_load_btd_global_arg_addr_intel:
    case nir_intrinsic_load_btd_local_arg_addr_intel:

@@ -2133,14 +2133,6 @@ image("load_raw_intel", src_comp=[1], dest_comp=0,
       flags=[CAN_ELIMINATE])
 image("store_raw_intel", src_comp=[1, 0])
 
-# Intrinsic to load a block of at least 32B of constant data from a 64-bit
-# global memory address. The memory address must be uniform and 32B-aligned.
-# The second source is a predicate which indicates whether or not to actually
-# do the load.
-# src[] = { address, predicate }.
-intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0,
-          bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
-
 # Number of data items being operated on for a SIMD program.
 system_value("simd_width_intel", 1)
 

@@ -155,7 +155,6 @@ can_remat_instr(nir_instr *instr, struct sized_bitset *remat)
    case nir_intrinsic_load_vulkan_descriptor:
    case nir_intrinsic_load_push_constant:
    case nir_intrinsic_load_global_constant:
-   case nir_intrinsic_load_global_const_block_intel:
    case nir_intrinsic_load_desc_set_address_intel:
       /* These intrinsics don't need to be spilled as long as they don't
        * depend on any spilled values.

@@ -6816,69 +6816,6 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
       fs_nir_emit_global_atomic(ntb, bld, instr);
       break;
 
-   case nir_intrinsic_load_global_const_block_intel: {
-      assert(instr->def.bit_size == 32);
-      assert(instr->num_components == 8 || instr->num_components == 16);
-
-      const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-      brw_reg load_val;
-
-      bool is_pred_const = nir_src_is_const(instr->src[1]);
-      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-         /* In this case, we don't want the UBO load at all. We really
-          * shouldn't get here but it's possible.
-          */
-         load_val = brw_imm_ud(0);
-      } else {
-         /* The uniform process may stomp the flag so do this first */
-         brw_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-
-         load_val = ubld.vgrf(BRW_TYPE_UD);
-
-         /* If the predicate is constant and we got here, then it's non-zero
-          * and we don't need the predicate at all.
-          */
-         if (!is_pred_const) {
-            /* Load the predicate */
-            brw_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-            fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-            mov->conditional_mod = BRW_CONDITIONAL_NZ;
-
-            /* Stomp the destination with 0 if we're OOB */
-            mov = ubld.MOV(load_val, brw_imm_ud(0));
-            mov->predicate = BRW_PREDICATE_NORMAL;
-            mov->predicate_inverse = true;
-         }
-
-         brw_reg srcs[A64_LOGICAL_NUM_SRCS];
-         srcs[A64_LOGICAL_ADDRESS] = addr;
-         srcs[A64_LOGICAL_SRC] = brw_reg(); /* No source data */
-         srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
-         /* This intrinsic loads memory from a uniform address, sometimes
-          * shared across lanes. We never need to mask it.
-          */
-         srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
-
-         fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-                                   load_val, srcs, A64_LOGICAL_NUM_SRCS);
-         if (!is_pred_const)
-            load->predicate = BRW_PREDICATE_NORMAL;
-      }
-
-      /* From the HW perspective, we just did a single SIMD16 instruction
-       * which loaded a dword in each SIMD channel. From NIR's perspective,
-       * this instruction returns a vec16. Any users of this data in the
-       * back-end will expect a vec16 per SIMD channel so we have to emit a
-       * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-       * will generally clean them up for us.
-       */
-      for (unsigned i = 0; i < instr->num_components; i++) {
-         bld.MOV(retype(offset(dest, bld, i), BRW_TYPE_UD),
-                 component(load_val, i));
-      }
-      break;
-   }
-
    case nir_intrinsic_load_global_constant_uniform_block_intel: {
       const unsigned total_dwords = ALIGN(instr->num_components,
                                           REG_SIZE * reg_unit(devinfo) / 4);

@@ -1421,8 +1421,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {

@@ -2175,8 +2174,12 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
          nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-         data[i] = nir_load_global_const_block_intel(b, 16, addr,
-                                                     nir_imm_true(b));
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+            .align_mul = 64,
+            .align_offset = 64);
       }
 
       sysval = nir_extract_bits(b, data, 2, suboffset * 8,

@@ -60,10 +60,13 @@ brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
 }
 
 static inline nir_def *
-brw_nir_rt_load_const(nir_builder *b, unsigned components,
-                      nir_def *addr, nir_def *pred)
+brw_nir_rt_load_const(nir_builder *b, unsigned components, nir_def *addr)
 {
-   return nir_load_global_const_block_intel(b, components, addr, pred);
+   return nir_load_global_constant_uniform_block_intel(
+      b, components, 32, addr,
+      .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+      .align_mul = 64,
+      .align_offset = 64);
 }
 
 static inline nir_def *

@@ -312,7 +315,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
                              nir_def *addr)
 {
    nir_def *data;
-   data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
+   data = brw_nir_rt_load_const(b, 16, addr);
    defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
 
    defs->call_stack_handler_addr =

@@ -335,7 +338,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
    defs->sw_stack_size = nir_channel(b, data, 12);
    defs->launch_size = nir_channels(b, data, 0x7u << 13);
 
-   data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
+   data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64));
    defs->call_sbt_addr =
       nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                              nir_extract_i16(b, nir_channel(b, data, 1),

@@ -5251,69 +5251,6 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
       fs_nir_emit_global_atomic(ntb, bld, instr);
       break;
 
-   case nir_intrinsic_load_global_const_block_intel: {
-      assert(instr->def.bit_size == 32);
-      assert(instr->num_components == 8 || instr->num_components == 16);
-
-      const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-      elk_fs_reg load_val;
-
-      bool is_pred_const = nir_src_is_const(instr->src[1]);
-      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-         /* In this case, we don't want the UBO load at all. We really
-          * shouldn't get here but it's possible.
-          */
-         load_val = elk_imm_ud(0);
-      } else {
-         /* The uniform process may stomp the flag so do this first */
-         elk_fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-
-         load_val = ubld.vgrf(ELK_REGISTER_TYPE_UD);
-
-         /* If the predicate is constant and we got here, then it's non-zero
-          * and we don't need the predicate at all.
-          */
-         if (!is_pred_const) {
-            /* Load the predicate */
-            elk_fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-            elk_fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-            mov->conditional_mod = ELK_CONDITIONAL_NZ;
-
-            /* Stomp the destination with 0 if we're OOB */
-            mov = ubld.MOV(load_val, elk_imm_ud(0));
-            mov->predicate = ELK_PREDICATE_NORMAL;
-            mov->predicate_inverse = true;
-         }
-
-         elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
-         srcs[A64_LOGICAL_ADDRESS] = addr;
-         srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
-         srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
-         /* This intrinsic loads memory from a uniform address, sometimes
-          * shared across lanes. We never need to mask it.
-          */
-         srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
-
-         elk_fs_inst *load = ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-                                       load_val, srcs, A64_LOGICAL_NUM_SRCS);
-         if (!is_pred_const)
-            load->predicate = ELK_PREDICATE_NORMAL;
-      }
-
-      /* From the HW perspective, we just did a single SIMD16 instruction
-       * which loaded a dword in each SIMD channel. From NIR's perspective,
-       * this instruction returns a vec16. Any users of this data in the
-       * back-end will expect a vec16 per SIMD channel so we have to emit a
-       * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-       * will generally clean them up for us.
-       */
-      for (unsigned i = 0; i < instr->num_components; i++) {
-         bld.MOV(retype(offset(dest, bld, i), ELK_REGISTER_TYPE_UD),
-                 component(load_val, i));
-      }
-      break;
-   }
-
    case nir_intrinsic_load_global_constant_uniform_block_intel: {
       const unsigned total_dwords = ALIGN(instr->num_components,
                                           REG_SIZE * reg_unit(devinfo) / 4);

@@ -1139,8 +1139,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {

@@ -1873,8 +1872,7 @@ elk_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
          nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-         data[i] = nir_load_global_const_block_intel(b, 16, addr,
-                                                     nir_imm_true(b));
+         data[i] = nir_load_global_constant_uniform_block_intel(b, 16, 32, addr);
       }
 
       sysval = nir_extract_bits(b, data, 2, suboffset * 8,

@@ -106,39 +106,6 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
       intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
       return true;
 
-   case nir_intrinsic_load_global_const_block_intel:
-      /* Only deal with the simple predication true case */
-      if (!nir_src_is_const(intrin->src[1]) ||
-          nir_src_as_uint(intrin->src[1]) == 0)
-         return false;
-
-      if (nir_src_is_divergent(intrin->src[0]))
-         return false;
-
-      if (intrin->def.bit_size != 32)
-         return false;
-
-      /* Without the LSC, we can only do block loads of at least 4dwords (1
-       * oword).
-       */
-      if (!devinfo->has_lsc && intrin->def.num_components < 4)
-         return false;
-
-      b->cursor = nir_before_instr(&intrin->instr);
-      nir_def *def =
-         nir_load_global_constant_uniform_block_intel(
-            b,
-            intrin->def.num_components,
-            intrin->def.bit_size,
-            intrin->src[0].ssa,
-            .access = ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER,
-            .align_mul = 4,
-            .align_offset = 4);
-
-      nir_def_rewrite_uses(&intrin->def, def);
-      nir_instr_remove(&intrin->instr);
-      return true;
-
    default:
       return false;
    }

@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
       /* Load two just in case we go over a 64B boundary */
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
-         nir_def *pred;
+         nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = nir_intrinsic_access(load),
+            .align_mul = 64,
+            .align_offset = 64);
          if (bound) {
-            pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-         } else {
-            pred = nir_imm_true(b);
+            data[i] = nir_bcsel(b,
+                                nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+                                data[i],
+                                nir_imm_int(b, 0));
          }
-
-         nir_def *addr = nir_iadd_imm(b, base_addr,
-                                      aligned_offset + i * 64);
-
-         data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
       }
 
       val = nir_extract_bits(b, data, 2, suboffset * 8,

@@ -44,7 +44,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
    unsigned byte_size = bit_size / 8;
 
    nir_def *val;
-   if (nir_src_is_const(load->src[1])) {
+   if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) {
       uint32_t offset = nir_src_as_uint(load->src[1]);
 
       /* Things should be component-aligned. */

@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
       /* Load two just in case we go over a 64B boundary */
      nir_def *data[2];
      for (unsigned i = 0; i < 2; i++) {
-         nir_def *pred;
+         nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = nir_intrinsic_access(load),
+            .align_mul = 64,
+            .align_offset = 64);
          if (bound) {
-            pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-         } else {
-            pred = nir_imm_true(b);
+            data[i] = nir_bcsel(b,
+                                nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+                                data[i],
+                                nir_imm_int(b, 0));
          }
-
-         nir_def *addr = nir_iadd_imm(b, base_addr,
-                                      aligned_offset + i * 64);
-
-         data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
      }
 
      val = nir_extract_bits(b, data, 2, suboffset * 8,

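The lower_ubo_load_instr hunks above (which appear to be two copies of the same lowering) apply the idea from the commit message to bounded UBO loads: each 64B block is loaded unconditionally and then zeroed with a bcsel when it lies past the bound, instead of predicating the load itself. A standalone sketch of that pattern (assuming a nir_builder context and a bound def holding the buffer size in bytes; the helper name and parameters are illustrative):

static void
load_two_bounded_blocks(nir_builder *b, nir_def *base_addr,
                        unsigned aligned_offset, nir_def *bound,
                        nir_def *data[2])
{
   /* Load two blocks in case the access straddles a 64B boundary. */
   for (unsigned i = 0; i < 2; i++) {
      nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);

      data[i] = nir_load_global_constant_uniform_block_intel(
         b, 16, 32, addr,
         .access = ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER,
         .align_mul = 64,
         .align_offset = 64);

      if (bound) {
         /* Keep the block only if its whole 64B window is in bounds. */
         data[i] = nir_bcsel(b,
                             nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
                             data[i], nir_imm_int(b, 0));
      }
   }
}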