intel/fs: Add and use a new load_simd_width_intel intrinsic
Intrinsic to get the SIMD width, which not always the same as subgroup size. Starting with a small scope (Intel), but we can rename it later to generalize if this turns out useful for other drivers. Change brw_nir_lower_cs_intrinsics() to use this intrinsic instead of a width will be passed as argument. The pass also used to optimized load_subgroup_id for the case that the workgroup fitted into a single thread (it will be constant zero). This optimization moved together with lowering of the SIMD. This is a preparation for letting the drivers call it before the brw_compile_cs() step. No shader-db changes in BDW, SKL, ICL and TGL. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jordan Justen <jordan.l.justen@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4794>
This commit is contained in:
@@ -645,6 +645,9 @@ system_value("color1", 4)
|
|||||||
# System value for internal compute shaders in radeonsi.
|
# System value for internal compute shaders in radeonsi.
|
||||||
system_value("user_data_amd", 4)
|
system_value("user_data_amd", 4)
|
||||||
|
|
||||||
|
# Number of data items being operated on for a SIMD program.
|
||||||
|
system_value("simd_width_intel", 1)
|
||||||
|
|
||||||
# Barycentric coordinate intrinsics.
|
# Barycentric coordinate intrinsics.
|
||||||
#
|
#
|
||||||
# These set up the barycentric coordinates for a particular interpolation.
|
# These set up the barycentric coordinates for a particular interpolation.
|
||||||
|
@@ -8946,6 +8946,56 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo,
|
|||||||
prog_data->nr_params);
|
prog_data->nr_params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
filter_simd(const nir_instr *instr, const void *_options)
|
||||||
|
{
|
||||||
|
if (instr->type != nir_instr_type_intrinsic)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
||||||
|
case nir_intrinsic_load_simd_width_intel:
|
||||||
|
case nir_intrinsic_load_subgroup_id:
|
||||||
|
return true;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
lower_simd(nir_builder *b, nir_instr *instr, void *options)
|
||||||
|
{
|
||||||
|
uintptr_t simd_width = (uintptr_t)options;
|
||||||
|
|
||||||
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
||||||
|
case nir_intrinsic_load_simd_width_intel:
|
||||||
|
return nir_imm_int(b, simd_width);
|
||||||
|
|
||||||
|
case nir_intrinsic_load_subgroup_id:
|
||||||
|
/* If the whole workgroup fits in one thread, we can lower subgroup_id
|
||||||
|
* to a constant zero.
|
||||||
|
*/
|
||||||
|
if (!b->shader->info.cs.local_size_variable) {
|
||||||
|
unsigned local_workgroup_size = b->shader->info.cs.local_size[0] *
|
||||||
|
b->shader->info.cs.local_size[1] *
|
||||||
|
b->shader->info.cs.local_size[2];
|
||||||
|
if (local_workgroup_size <= simd_width)
|
||||||
|
return nir_imm_int(b, 0);
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
|
||||||
|
{
|
||||||
|
nir_shader_lower_instructions(nir, filter_simd, lower_simd,
|
||||||
|
(void *)(uintptr_t)dispatch_width);
|
||||||
|
}
|
||||||
|
|
||||||
static nir_shader *
|
static nir_shader *
|
||||||
compile_cs_to_nir(const struct brw_compiler *compiler,
|
compile_cs_to_nir(const struct brw_compiler *compiler,
|
||||||
void *mem_ctx,
|
void *mem_ctx,
|
||||||
@@ -8956,7 +9006,9 @@ compile_cs_to_nir(const struct brw_compiler *compiler,
|
|||||||
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
|
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
|
||||||
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
|
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
|
||||||
|
|
||||||
NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);
|
NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics);
|
||||||
|
|
||||||
|
NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
|
||||||
|
|
||||||
/* Clean up after the local index and ID calculations. */
|
/* Clean up after the local index and ID calculations. */
|
||||||
NIR_PASS_V(shader, nir_opt_constant_folding);
|
NIR_PASS_V(shader, nir_opt_constant_folding);
|
||||||
|
@@ -3879,6 +3879,11 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case nir_intrinsic_load_simd_width_intel: {
|
||||||
|
bld.MOV(dest, brw_imm_ud(cs_prog_data->simd_size));
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
default:
|
default:
|
||||||
nir_emit_intrinsic(bld, instr);
|
nir_emit_intrinsic(bld, instr);
|
||||||
break;
|
break;
|
||||||
|
@@ -99,8 +99,7 @@ void
|
|||||||
brw_nir_link_shaders(const struct brw_compiler *compiler,
|
brw_nir_link_shaders(const struct brw_compiler *compiler,
|
||||||
nir_shader *producer, nir_shader *consumer);
|
nir_shader *producer, nir_shader *consumer);
|
||||||
|
|
||||||
bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
|
bool brw_nir_lower_cs_intrinsics(nir_shader *nir);
|
||||||
unsigned dispatch_width);
|
|
||||||
void brw_nir_lower_alpha_to_coverage(nir_shader *shader);
|
void brw_nir_lower_alpha_to_coverage(nir_shader *shader);
|
||||||
void brw_nir_lower_legacy_clipping(nir_shader *nir,
|
void brw_nir_lower_legacy_clipping(nir_shader *nir,
|
||||||
int nr_userclip_plane_consts,
|
int nr_userclip_plane_consts,
|
||||||
|
@@ -26,7 +26,6 @@
|
|||||||
|
|
||||||
struct lower_intrinsics_state {
|
struct lower_intrinsics_state {
|
||||||
nir_shader *nir;
|
nir_shader *nir;
|
||||||
unsigned dispatch_width;
|
|
||||||
nir_function_impl *impl;
|
nir_function_impl *impl;
|
||||||
bool progress;
|
bool progress;
|
||||||
nir_builder builder;
|
nir_builder builder;
|
||||||
@@ -61,14 +60,10 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
|
|||||||
if (!local_index) {
|
if (!local_index) {
|
||||||
assert(!local_id);
|
assert(!local_id);
|
||||||
|
|
||||||
nir_ssa_def *subgroup_id;
|
nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
|
||||||
if (state->local_workgroup_size <= state->dispatch_width)
|
|
||||||
subgroup_id = nir_imm_int(b, 0);
|
|
||||||
else
|
|
||||||
subgroup_id = nir_load_subgroup_id(b);
|
|
||||||
|
|
||||||
nir_ssa_def *thread_local_id =
|
nir_ssa_def *thread_local_id =
|
||||||
nir_imul_imm(b, subgroup_id, state->dispatch_width);
|
nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
|
||||||
nir_ssa_def *channel = nir_load_subgroup_invocation(b);
|
nir_ssa_def *channel = nir_load_subgroup_invocation(b);
|
||||||
nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
|
nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
|
||||||
|
|
||||||
@@ -151,35 +146,25 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case nir_intrinsic_load_subgroup_id:
|
|
||||||
if (state->local_workgroup_size > 8)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/* For small workgroup sizes, we know subgroup_id will be zero */
|
|
||||||
sysval = nir_imm_int(b, 0);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case nir_intrinsic_load_num_subgroups: {
|
case nir_intrinsic_load_num_subgroups: {
|
||||||
|
nir_ssa_def *size;
|
||||||
if (state->nir->info.cs.local_size_variable) {
|
if (state->nir->info.cs.local_size_variable) {
|
||||||
nir_ssa_def *size_xyz = nir_load_local_group_size(b);
|
nir_ssa_def *size_xyz = nir_load_local_group_size(b);
|
||||||
nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
|
nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
|
||||||
nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
|
nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
|
||||||
nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
|
nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
|
||||||
nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
|
size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
|
||||||
|
} else {
|
||||||
|
size = nir_imm_int(b, nir->info.cs.local_size[0] *
|
||||||
|
nir->info.cs.local_size[1] *
|
||||||
|
nir->info.cs.local_size[2]);
|
||||||
|
}
|
||||||
|
|
||||||
/* Calculate the equivalent of DIV_ROUND_UP. */
|
/* Calculate the equivalent of DIV_ROUND_UP. */
|
||||||
sysval = nir_idiv(b,
|
nir_ssa_def *simd_width = nir_load_simd_width_intel(b);
|
||||||
nir_iadd_imm(b,
|
sysval =
|
||||||
nir_iadd_imm(b, size, state->dispatch_width), -1),
|
nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
|
||||||
nir_imm_int(b, state->dispatch_width));
|
simd_width);
|
||||||
} else {
|
|
||||||
unsigned local_workgroup_size =
|
|
||||||
nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
|
|
||||||
nir->info.cs.local_size[2];
|
|
||||||
unsigned num_subgroups =
|
|
||||||
DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
|
|
||||||
sysval = nir_imm_int(b, num_subgroups);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,14 +195,12 @@ lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
brw_nir_lower_cs_intrinsics(nir_shader *nir,
|
brw_nir_lower_cs_intrinsics(nir_shader *nir)
|
||||||
unsigned dispatch_width)
|
|
||||||
{
|
{
|
||||||
assert(nir->info.stage == MESA_SHADER_COMPUTE);
|
assert(nir->info.stage == MESA_SHADER_COMPUTE);
|
||||||
|
|
||||||
struct lower_intrinsics_state state = {
|
struct lower_intrinsics_state state = {
|
||||||
.nir = nir,
|
.nir = nir,
|
||||||
.dispatch_width = dispatch_width,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!nir->info.cs.local_size_variable) {
|
if (!nir->info.cs.local_size_variable) {
|
||||||
|
Reference in New Issue
Block a user