intel: Use common helpers for TCS passthrough shaders

Rob added these new helpers a while back, and freedreno and radeonsi
already share them.  We should use them too.  The new helpers use variables and
system value intrinsics, so we can drop the explicit binding table
creation and just use the normal paths.

Because we have to rewrite the system value uploading anyway, we drop
the scrambling of the default tessellation levels on upload, and instead
let the compiler go ahead and remap components like any normal shader.
In theory, this results in more shuffling in the shader.  In practice,
we already do MOVs for message setup.  In the passthrough shaders I
looked at, this resulted in no extra instructions on Icelake (SIMD8
SINGLE_PATCH) and Tigerlake (8_PATCH).  On Haswell, one shader grew by
a single instruction for a pittance of cycles in a stage that isn't a
performance bottleneck anyway.  Avoiding remapping wasn't so much an
optimization as just the way that I originally wrote it, so keeping that
special case is not worth it.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20809>
This commit is contained in:
Kenneth Graunke
2023-01-19 23:25:20 -08:00
committed by Marge Bot
parent 3a9edfc494
commit 96ba0344db
3 changed files with 91 additions and 127 deletions

View File

@@ -449,6 +449,8 @@ crocus_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
unsigned num_system_values = 0; unsigned num_system_values = 0;
unsigned patch_vert_idx = -1; unsigned patch_vert_idx = -1;
unsigned tess_outer_default_idx = -1;
unsigned tess_inner_default_idx = -1;
unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES]; unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES];
unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
unsigned variable_group_size_idx = -1; unsigned variable_group_size_idx = -1;
@@ -539,6 +541,36 @@ crocus_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
b.cursor = nir_before_instr(instr); b.cursor = nir_before_instr(instr);
offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t)); offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t));
break; break;
case nir_intrinsic_load_tess_level_outer_default:
if (tess_outer_default_idx == -1) {
tess_outer_default_idx = num_system_values;
num_system_values += 4;
}
for (int i = 0; i < 4; i++) {
system_values[tess_outer_default_idx + i] =
BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
}
b.cursor = nir_before_instr(instr);
offset =
nir_imm_int(&b, tess_outer_default_idx * sizeof(uint32_t));
break;
case nir_intrinsic_load_tess_level_inner_default:
if (tess_inner_default_idx == -1) {
tess_inner_default_idx = num_system_values;
num_system_values += 2;
}
for (int i = 0; i < 2; i++) {
system_values[tess_inner_default_idx + i] =
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X + i;
}
b.cursor = nir_before_instr(instr);
offset =
nir_imm_int(&b, tess_inner_default_idx * sizeof(uint32_t));
break;
case nir_intrinsic_image_deref_load_param_intel: { case nir_intrinsic_image_deref_load_param_intel: {
assert(devinfo->ver < 9); assert(devinfo->ver < 9);
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
@@ -1420,52 +1452,19 @@ crocus_compile_tcs(struct crocus_context *ice,
if (ish) { if (ish) {
nir = nir_shader_clone(mem_ctx, ish->nir); nir = nir_shader_clone(mem_ctx, ish->nir);
crocus_setup_uniforms(devinfo, mem_ctx, nir, prog_data, &system_values,
&num_system_values, &num_cbufs);
crocus_lower_swizzles(nir, &key->base.tex);
crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
num_system_values, num_cbufs, &key->base.tex);
if (can_push_ubo(devinfo))
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
} else { } else {
nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, key); nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, key);
/* Reserve space for passing the default tess levels as constants. */
num_cbufs = 1;
num_system_values = 8;
system_values =
rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values);
prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values);
prog_data->nr_params = num_system_values;
if (key->_tes_primitive_mode == TESS_PRIMITIVE_QUADS) {
for (int i = 0; i < 4; i++)
system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y;
} else if (key->_tes_primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
for (int i = 0; i < 3; i++)
system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
} else {
assert(key->_tes_primitive_mode == TESS_PRIMITIVE_ISOLINES);
system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y;
system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
}
/* Manually setup the TCS binding table. */
memset(&bt, 0, sizeof(bt));
bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1;
bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1;
bt.size_bytes = 4;
prog_data->ubo_ranges[0].length = 1;
} }
crocus_setup_uniforms(devinfo, mem_ctx, nir, prog_data, &system_values,
&num_system_values, &num_cbufs);
crocus_lower_swizzles(nir, &key->base.tex);
crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
num_system_values, num_cbufs, &key->base.tex);
if (can_push_ubo(devinfo))
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
struct brw_tcs_prog_key key_clean = *key; struct brw_tcs_prog_key key_clean = *key;
crocus_sanitize_tex_key(&key_clean.base.tex); crocus_sanitize_tex_key(&key_clean.base.tex);

View File

@@ -482,6 +482,8 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
unsigned num_system_values = 0; unsigned num_system_values = 0;
unsigned patch_vert_idx = -1; unsigned patch_vert_idx = -1;
unsigned tess_outer_default_idx = -1;
unsigned tess_inner_default_idx = -1;
unsigned ucp_idx[IRIS_MAX_CLIP_PLANES]; unsigned ucp_idx[IRIS_MAX_CLIP_PLANES];
unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
unsigned variable_group_size_idx = -1; unsigned variable_group_size_idx = -1;
@@ -581,6 +583,36 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
offset = nir_imm_int(&b, system_values_start + offset = nir_imm_int(&b, system_values_start +
patch_vert_idx * sizeof(uint32_t)); patch_vert_idx * sizeof(uint32_t));
break; break;
case nir_intrinsic_load_tess_level_outer_default:
if (tess_outer_default_idx == -1) {
tess_outer_default_idx = num_system_values;
num_system_values += 4;
}
for (int i = 0; i < 4; i++) {
system_values[tess_outer_default_idx + i] =
BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
}
b.cursor = nir_before_instr(instr);
offset = nir_imm_int(&b, system_values_start +
tess_outer_default_idx * sizeof(uint32_t));
break;
case nir_intrinsic_load_tess_level_inner_default:
if (tess_inner_default_idx == -1) {
tess_inner_default_idx = num_system_values;
num_system_values += 2;
}
for (int i = 0; i < 2; i++) {
system_values[tess_inner_default_idx + i] =
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X + i;
}
b.cursor = nir_before_instr(instr);
offset = nir_imm_int(&b, system_values_start +
tess_inner_default_idx * sizeof(uint32_t));
break;
case nir_intrinsic_image_deref_load_param_intel: { case nir_intrinsic_image_deref_load_param_intel: {
assert(devinfo->ver < 9); assert(devinfo->ver < 9);
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
@@ -1530,50 +1562,16 @@ iris_compile_tcs(struct iris_screen *screen,
if (ish) { if (ish) {
nir = nir_shader_clone(mem_ctx, ish->nir); nir = nir_shader_clone(mem_ctx, ish->nir);
iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
&num_system_values, &num_cbufs);
iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
num_system_values, num_cbufs);
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
} else { } else {
nir = nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, &brw_key);
brw_nir_create_passthrough_tcs(mem_ctx, compiler, &brw_key);
/* Reserve space for passing the default tess levels as constants. */
num_cbufs = 1;
num_system_values = 8;
system_values =
rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values);
prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values);
prog_data->nr_params = num_system_values;
if (key->_tes_primitive_mode == TESS_PRIMITIVE_QUADS) {
for (int i = 0; i < 4; i++)
system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y;
} else if (key->_tes_primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
for (int i = 0; i < 3; i++)
system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
} else {
assert(key->_tes_primitive_mode == TESS_PRIMITIVE_ISOLINES);
system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y;
system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
}
/* Manually setup the TCS binding table. */
memset(&bt, 0, sizeof(bt));
bt.sizes[IRIS_SURFACE_GROUP_UBO] = 1;
bt.used_mask[IRIS_SURFACE_GROUP_UBO] = 1;
bt.size_bytes = 4;
prog_data->ubo_ranges[0].length = 1;
} }
iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
&num_system_values, &num_cbufs);
iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
num_system_values, num_cbufs);
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
struct brw_compile_tcs_params params = { struct brw_compile_tcs_params params = {
.nir = nir, .nir = nir,
.key = &brw_key, .key = &brw_key,

View File

@@ -189,9 +189,6 @@ remap_patch_urb_offsets(nir_block *block, nir_builder *b,
const struct brw_vue_map *vue_map, const struct brw_vue_map *vue_map,
enum tess_primitive_mode tes_primitive_mode) enum tess_primitive_mode tes_primitive_mode)
{ {
const bool is_passthrough_tcs = b->shader->info.name &&
strcmp(b->shader->info.name, "passthrough TCS") == 0;
nir_foreach_instr_safe(instr, block) { nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic) if (instr->type != nir_instr_type_intrinsic)
continue; continue;
@@ -203,8 +200,7 @@ remap_patch_urb_offsets(nir_block *block, nir_builder *b,
if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
(stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
if (!is_passthrough_tcs && if (remap_tess_levels(b, intrin, tes_primitive_mode))
remap_tess_levels(b, intrin, tes_primitive_mode))
continue; continue;
int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
@@ -1858,50 +1854,21 @@ brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compile
{ {
const nir_shader_compiler_options *options = const nir_shader_compiler_options *options =
compiler->nir_options[MESA_SHADER_TESS_CTRL]; compiler->nir_options[MESA_SHADER_TESS_CTRL];
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_TESS_CTRL,
options, "passthrough TCS");
ralloc_steal(mem_ctx, b.shader);
nir_shader *nir = b.shader;
nir_variable *var;
nir_ssa_def *load;
nir_ssa_def *zero = nir_imm_int(&b, 0);
nir_ssa_def *invoc_id = nir_load_invocation_id(&b);
nir->info.inputs_read = key->outputs_written & uint64_t inputs_read = key->outputs_written &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
nir->info.outputs_written = key->outputs_written;
nir->info.tess.tcs_vertices_out = key->input_vertices;
nir->num_uniforms = 8 * sizeof(uint32_t);
var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0"); unsigned locations[64];
var->data.location = 0; unsigned num_locations = 0;
var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_1");
var->data.location = 1;
/* Write the patch URB header. */ u_foreach_bit64(varying, inputs_read)
for (int i = 0; i <= 1; i++) { locations[num_locations++] = varying;
load = nir_load_uniform(&b, 4, 32, zero, .base = i * 4 * sizeof(uint32_t));
nir_store_output(&b, load, zero,
.base = VARYING_SLOT_TESS_LEVEL_INNER - i,
.write_mask = WRITEMASK_XYZW);
}
/* Copy inputs to outputs. */
uint64_t varyings = nir->info.inputs_read;
while (varyings != 0) {
const int varying = ffsll(varyings) - 1;
load = nir_load_per_vertex_input(&b, 4, 32, invoc_id, zero, .base = varying);
nir_store_per_vertex_output(&b, load, invoc_id, zero,
.base = varying,
.write_mask = WRITEMASK_XYZW);
varyings &= ~BITFIELD64_BIT(varying);
}
nir_shader *nir =
nir_create_passthrough_tcs_impl(options, locations, num_locations,
key->input_vertices);
nir->info.inputs_read = inputs_read;
nir->info.tess._primitive_mode = key->_tes_primitive_mode;
nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs"); nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs");
struct brw_nir_compiler_opts opts = {}; struct brw_nir_compiler_opts opts = {};