From c92b8a802ed9c4b3f4ef5923bc98899ee7153011 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 12 Jul 2024 13:52:46 -0700 Subject: [PATCH] intel/brw: Move remaining compile stages to their own files Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_compile_bs.cpp | 195 +++ src/intel/compiler/brw_compile_cs.cpp | 183 +++ src/intel/compiler/brw_compile_fs.cpp | 867 ++++++++++++ .../{brw_mesh.cpp => brw_compile_mesh.cpp} | 0 src/intel/compiler/brw_compile_tes.cpp | 140 ++ src/intel/compiler/brw_fs.cpp | 1195 ----------------- src/intel/compiler/brw_shader.cpp | 130 -- src/intel/compiler/meson.build | 6 +- 8 files changed, 1390 insertions(+), 1326 deletions(-) create mode 100644 src/intel/compiler/brw_compile_bs.cpp create mode 100644 src/intel/compiler/brw_compile_cs.cpp create mode 100644 src/intel/compiler/brw_compile_fs.cpp rename src/intel/compiler/{brw_mesh.cpp => brw_compile_mesh.cpp} (100%) create mode 100644 src/intel/compiler/brw_compile_tes.cpp diff --git a/src/intel/compiler/brw_compile_bs.cpp b/src/intel/compiler/brw_compile_bs.cpp new file mode 100644 index 00000000000..83d9f020194 --- /dev/null +++ b/src/intel/compiler/brw_compile_bs.cpp @@ -0,0 +1,195 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_nir.h" +#include "brw_cfg.h" +#include "brw_private.h" +#include "intel_nir.h" +#include "shader_enums.h" +#include "dev/intel_debug.h" +#include "dev/intel_wa.h" + +#include + +static uint64_t +brw_bsr(const struct intel_device_info *devinfo, + uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset) +{ + assert(offset % 64 == 0); + assert(simd_size == 8 || simd_size == 16); + assert(local_arg_offset % 8 == 0); + + return offset | + SET_BITS(simd_size == 8, 4, 4) | + SET_BITS(local_arg_offset / 8, 2, 0); +} + +static uint8_t +compile_single_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params, + const struct brw_bs_prog_key *key, + struct brw_bs_prog_data *prog_data, + nir_shader *shader, + fs_generator *g, + struct brw_compile_stats *stats, + int *prog_offset) +{ + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->max_stack_size = MAX2(prog_data->max_stack_size, + shader->scratch_size); + + const unsigned max_dispatch_width = 16; + brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width); + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + + /* Since divergence is a lot more likely in RT than compute, it makes + * sense to limit ourselves to the smallest available SIMD for now. + */ + .required_width = compiler->devinfo->ver >= 20 ? 
16u : 8u, + }; + + std::unique_ptr v[2]; + + for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + if (dispatch_width == 8 && compiler->devinfo->ver >= 20) + continue; + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, shader, + dispatch_width, + stats != NULL, + debug_enabled); + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_bs(allow_spilling)) { + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, + v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s' and SIMD16 '%s'.\n", + simd_state.error[0], simd_state.error[1]); + return 0; + } + + assert(selected_simd < int(ARRAY_SIZE(v))); + fs_visitor *selected = v[selected_simd].get(); + assert(selected); + + const unsigned dispatch_width = selected->dispatch_width; + + int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), stats); + if (prog_offset) + *prog_offset = offset; + else + assert(offset == 0); + + return dispatch_width; +} + +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params) +{ + nir_shader *shader = params->base.nir; + struct brw_bs_prog_data *prog_data = params->prog_data; + unsigned num_resume_shaders = params->num_resume_shaders; + nir_shader **resume_shaders = params->resume_shaders; + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->base.ray_queries = shader->info.ray_queries; + prog_data->base.total_scratch = 0; + + prog_data->max_stack_size = 0; + prog_data->num_resume_shaders = num_resume_shaders; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + shader->info.stage); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + shader->info.name); + g.enable_debug(name); + } + + prog_data->simd_size = + compile_single_bs(compiler, params, params->key, prog_data, + shader, &g, params->base.stats, NULL); + if (prog_data->simd_size == 0) + return NULL; + + uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx, + uint64_t, num_resume_shaders); + for (unsigned i = 0; i < num_resume_shaders; i++) { + if (INTEL_DEBUG(DEBUG_RT)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s resume(%u) shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + i, shader->info.name); + g.enable_debug(name); + } + + /* TODO: Figure out shader stats etc. 
for resume shaders */ + int offset = 0; + uint8_t simd_size = + compile_single_bs(compiler, params, params->key, + prog_data, resume_shaders[i], &g, NULL, &offset); + if (simd_size == 0) + return NULL; + + assert(offset > 0); + resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0); + } + + /* We only have one constant data so we want to make sure they're all the + * same. + */ + for (unsigned i = 0; i < num_resume_shaders; i++) { + assert(resume_shaders[i]->constant_data_size == + shader->constant_data_size); + assert(memcmp(resume_shaders[i]->constant_data, + shader->constant_data, + shader->constant_data_size) == 0); + } + + g.add_const_data(shader->constant_data, shader->constant_data_size); + g.add_resume_sbt(num_resume_shaders, resume_sbt); + + return g.get_assembly(); +} diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp new file mode 100644 index 00000000000..90cbfc7afd7 --- /dev/null +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -0,0 +1,183 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_nir.h" +#include "brw_cfg.h" +#include "brw_private.h" +#include "intel_nir.h" +#include "shader_enums.h" +#include "dev/intel_debug.h" +#include "dev/intel_wa.h" + +#include + +static void +fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) +{ + block->dwords = dwords; + block->regs = DIV_ROUND_UP(dwords, 8); + block->size = block->regs * 32; +} + +static void +cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data) +{ + const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); + + /* The thread ID should be stored in the last param dword */ + assert(subgroup_id_index == -1 || + subgroup_id_index == (int)prog_data->nr_params - 1); + + unsigned cross_thread_dwords, per_thread_dwords; + if (subgroup_id_index >= 0) { + /* Fill all but the last register with cross-thread payload */ + cross_thread_dwords = 8 * (subgroup_id_index / 8); + per_thread_dwords = prog_data->nr_params - cross_thread_dwords; + assert(per_thread_dwords > 0 && per_thread_dwords <= 8); + } else { + /* Fill all data using cross-thread payload */ + cross_thread_dwords = prog_data->nr_params; + per_thread_dwords = 0u; + } + + fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); + fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); + + assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || + cs_prog_data->push.per_thread.size == 0); + assert(cs_prog_data->push.cross_thread.dwords + + cs_prog_data->push.per_thread.dwords == + prog_data->nr_params); +} + +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, + struct brw_compile_cs_params *params) +{ + const nir_shader *nir = params->base.nir; + const struct brw_cs_prog_key *key = params->key; + struct brw_cs_prog_data *prog_data = params->prog_data; + + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? 
+ params->base.debug_flag : DEBUG_CS); + + prog_data->base.stage = MESA_SHADER_COMPUTE; + prog_data->base.total_shared = nir->info.shared_size; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + if (!nir->info.workgroup_size_variable) { + prog_data->local_size[0] = nir->info.workgroup_size[0]; + prog_data->local_size[1] = nir->info.workgroup_size[1]; + prog_data->local_size[2] = nir->info.workgroup_size[2]; + } + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + .required_width = brw_required_dispatch_width(&nir->info), + }; + + std::unique_ptr v[3]; + + for (unsigned simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); + brw_nir_apply_key(shader, compiler, &key->base, + dispatch_width); + + NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); + + /* Clean up after the local index and ID calculations. */ + NIR_PASS(_, shader, nir_opt_constant_folding); + NIR_PASS(_, shader, nir_opt_dce); + + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, + shader, dispatch_width, + params->base.stats != NULL, + debug_enabled); + + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); + + const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; + + if (v[simd]->run_cs(allow_spilling)) { + cs_fill_push_const_info(compiler->devinfo, prog_data); + + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s\n", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); + return NULL; + } + + assert(selected_simd < 3); + + if (!nir->info.workgroup_size_variable) + prog_data->prog_mask = 1 << selected_simd; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + MESA_SHADER_COMPUTE); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s compute shader %s", + nir->info.label ? + nir->info.label : "unnamed", + nir->info.name); + g.enable_debug(name); + } + + uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); + + struct brw_compile_stats *stats = params->base.stats; + for (unsigned simd = 0; simd < 3; simd++) { + if (prog_data->prog_mask & (1u << simd)) { + assert(v[simd]); + prog_data->prog_offset[simd] = + g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, + v[simd]->performance_analysis.require(), stats); + if (stats) + stats->max_dispatch_width = max_dispatch_width; + stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = 8u << simd; + } + } + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); +} + diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp new file mode 100644 index 00000000000..59565502221 --- /dev/null +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -0,0 +1,867 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_nir.h" +#include "brw_cfg.h" +#include "brw_private.h" +#include "intel_nir.h" +#include "shader_enums.h" +#include "dev/intel_debug.h" +#include "dev/intel_wa.h" + +#include + +using namespace brw; + +/** + * Turn one of the two CENTROID barycentric modes into PIXEL mode. + */ +static enum brw_barycentric_mode +centroid_to_pixel(enum brw_barycentric_mode bary) +{ + assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || + bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + return (enum brw_barycentric_mode) ((unsigned) bary - 1); +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const nir_shader *nir, + const struct brw_mue_map *mue_map) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel)); + + int urb_next = 0; /* in vec4s */ + + const uint64_t inputs_read = + nir->info.inputs_read & ~nir->info.per_primitive_inputs; + + /* Figure out where each of the incoming setup attributes lands. */ + if (key->mesh_input != BRW_NEVER) { + /* Per-Primitive Attributes are laid out by Hardware before the regular + * attributes, so order them like this to make easy later to map setup + * into real HW registers. + */ + if (nir->info.per_primitive_inputs) { + uint64_t per_prim_inputs_read = + nir->info.inputs_read & nir->info.per_primitive_inputs; + + /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots + * are always at the beginning, because they come from MUE + * Primitive Header, not Per-Primitive Attributes. + */ + const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | + VARYING_BIT_LAYER | + VARYING_BIT_PRIMITIVE_SHADING_RATE; + + if (mue_map) { + unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; + unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; + + bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; + + if (reads_header || mue_map->user_data_in_primitive_header) { + /* Primitive Shading Rate, Layer and Viewport live in the same + * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport + * is dword 2). + */ + if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) + prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; + + if (per_prim_inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + + if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; + + per_prim_inputs_read &= ~primitive_header_bits; + } else { + /* If fs doesn't need primitive header, then it won't be made + * available through SBE_MESH, so we have to skip them when + * calculating offset from start of per-prim data. 
+ */ + per_prim_start_dw += mue_map->per_primitive_header_size_dw; + per_prim_size_dw -= mue_map->per_primitive_header_size_dw; + } + + u_foreach_bit64(i, per_prim_inputs_read) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_prim_start_dw); + unsigned pos_dw = unsigned(start) - per_prim_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next = per_prim_size_dw / 4; + } else { + /* With no MUE map, we never read the primitive header, and + * per-primitive attributes won't be packed either, so just lay + * them in varying order. + */ + per_prim_inputs_read &= ~primitive_header_bits; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_prim_inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + + /* The actual setup attributes later must be aligned to a full GRF. */ + urb_next = ALIGN(urb_next, 2); + } + + prog_data->num_per_primitive_inputs = urb_next; + } + + const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | + VARYING_BIT_CLIP_DIST1; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + if (inputs_read & clip_dist_bits) { + assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); + unique_fs_attrs &= ~clip_dist_bits; + } + + if (mue_map) { + unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; + unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; + + /* Per-Vertex header is available to fragment shader only if there's + * user data there. + */ + if (!mue_map->user_data_in_vertex_header) { + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* In Mesh, CLIP_DIST slots are always at the beginning, because + * they come from MUE Vertex Header, not Per-Vertex Attributes. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; + } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { + /* Clip distances are in MUE, but we are not reading them in FS. */ + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* Per-Vertex attributes are laid out ordered. Because we always link + * Mesh and Fragment shaders, the which slots are written and read by + * each of them will match. */ + u_foreach_bit64(i, unique_fs_attrs) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_vertex_start_dw); + unsigned pos_dw = unsigned(start) - per_vertex_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next += per_vertex_size_dw / 4; + } else { + /* If we don't have an MUE map, just lay down the inputs the FS reads + * in varying order, as we do for the legacy pipeline. 
+ */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (unique_fs_attrs & BITFIELD64_BIT(i)) + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + assert(!nir->info.per_primitive_inputs); + + uint64_t vue_header_bits = + VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + /* VUE header fields all live in the same URB slot, so we pass them + * as a single FS input attribute. We want to only count them once. + */ + if (inputs_read & vue_header_bits) { + unique_fs_attrs &= ~vue_header_bits; + unique_fs_attrs |= VARYING_BIT_PSIZ; + } + + if (util_bitcount64(unique_fs_attrs) <= 16) { + /* The SF/SBE pipeline stage can do arbitrary rearrangement of the + * first 16 varying inputs, so we can put them wherever we want. + * Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with + * a different vertex (or geometry) shader. + * + * VUE header fields share the same FS input attribute. + */ + if (inputs_read & vue_header_bits) { + if (inputs_read & VARYING_BIT_PSIZ) + prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next; + if (inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next; + if (inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next; + + urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them + * in an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + + /* Re-compute the VUE map here in the case that the one coming from + * geometry has more than one position slot (used for Primitive + * Replication). 
+ */ + struct intel_vue_map prev_stage_vue_map; + brw_compute_vue_map(devinfo, &prev_stage_vue_map, + key->input_slots_valid, + nir->info.separate_shader, 1); + + int first_slot = + brw_compute_first_urb_slot_required(inputs_read, + &prev_stage_vue_map); + + assert(prev_stage_vue_map.num_slots <= first_slot + 32); + for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; + slot++) { + int varying = prev_stage_vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = prev_stage_vue_map.num_slots - first_slot; + } + } + + prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->inputs = inputs_read; + + brw_compute_urb_setup_index(prog_data); +} +static bool +is_used_in_not_interp_frag_coord(nir_def *def) +{ + nir_foreach_use_including_if(src, def) { + if (nir_src_is_if(src)) + return true; + + if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic) + return true; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src)); + if (intrin->intrinsic != nir_intrinsic_load_frag_coord) + return true; + } + + return false; +} + +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum brw_barycentric_mode) is needed by the fragment shader. + * + * We examine the load_barycentric intrinsics rather than looking at input + * variables so that we catch interpolateAtCentroid() messages too, which + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, + const struct brw_wm_prog_key *key, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_function_impl(impl, shader) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + break; + default: + continue; + } + + /* Ignore WPOS; it doesn't require interpolation. */ + if (!is_used_in_not_interp_frag_coord(&intrin->def)) + continue; + + nir_intrinsic_op bary_op = intrin->intrinsic; + enum brw_barycentric_mode bary = + brw_barycentric_mode(key, intrin); + + barycentric_interp_modes |= 1 << bary; + + if (devinfo->needs_unlit_centroid_workaround && + bary_op == nir_intrinsic_load_barycentric_centroid) + barycentric_interp_modes |= 1 << centroid_to_pixel(bary); + } + } + } + + return barycentric_interp_modes; +} + +/** + * Return a bitfield where bit n is set if barycentric interpolation + * mode n (see enum brw_barycentric_mode) is needed by the fragment + * shader barycentric intrinsics that take an explicit offset or + * sample as argument. 
+ */ +static unsigned +brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_function_impl(impl, shader) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset || + intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample) + barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin); + } + } + } + + return barycentric_interp_modes; +} + +static void +brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + /* flat shading */ + if (var->data.interpolation != INTERP_MODE_FLAT) + continue; + + if (var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +static void +brw_nir_populate_wm_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct brw_mue_map *mue_map) +{ + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = + shader->info.fs.uses_sample_shading || + shader->info.outputs_read; + + assert(key->multisample_fbo != BRW_NEVER || + key->persample_interp == BRW_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = BRW_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, + key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. 
+ */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + + prog_data->uses_sample_mask = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. + */ + prog_data->uses_pos_offset = + prog_data->persample_dispatch != BRW_NEVER && + (BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; + prog_data->inner_coverage = shader->info.fs.inner_coverage; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(devinfo, key, shader); + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. + */ + if (prog_data->persample_dispatch == BRW_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + if (devinfo->ver >= 20) { + const unsigned offset_bary_modes = + brw_compute_offset_barycentric_interp_modes(key, shader); + + prog_data->uses_npc_bary_coefficients = + offset_bary_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS; + prog_data->uses_pc_bary_coefficients = + offset_bary_modes & ~BRW_BARYCENTRIC_NONPERSPECTIVE_BITS; + prog_data->uses_sample_offsets = + offset_bary_modes & ((1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) | + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); + } + + prog_data->uses_nonperspective_interp_modes = + (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) || + prog_data->uses_npc_bary_coefficients; + + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & BRW_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. + */ + assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); + + prog_data->coarse_pixel_dispatch = + brw_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + prog_data->uses_omask || + prog_data->sample_shading || + prog_data->uses_sample_mask || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil) { + prog_data->coarse_pixel_dispatch = BRW_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. 
Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = BRW_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. + */ + prog_data->uses_vmask = devinfo->verx10 < 125 || + shader->info.fs.needs_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != BRW_NEVER; + + prog_data->uses_src_w = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + prog_data->uses_src_depth = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_ALWAYS; + prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients || + (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_NEVER); + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); + brw_compute_flat_inputs(prog_data, shader); +} + +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, + struct brw_compile_fs_params *params) +{ + struct nir_shader *nir = params->base.nir; + const struct brw_wm_prog_key *key = params->key; + struct brw_wm_prog_data *prog_data = params->prog_data; + bool allow_spilling = params->allow_spilling; + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? + params->base.debug_flag : DEBUG_WM); + + prog_data->base.stage = MESA_SHADER_FRAGMENT; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + const struct intel_device_info *devinfo = compiler->devinfo; + const unsigned max_subgroup_size = 32; + + brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size); + brw_nir_lower_fs_inputs(nir, devinfo, key); + brw_nir_lower_fs_outputs(nir); + + /* From the SKL PRM, Volume 7, "Alpha Coverage": + * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in + * hardware, regardless of the state setting for this feature." + */ + if (key->alpha_to_coverage != BRW_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. 
+ */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); + } + + NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + brw_postprocess_nir(nir, compiler, debug_enabled, + key->base.robust_flags); + + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, + params->mue_map); + + std::unique_ptr v8, v16, v32, vmulti; + cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL, + *multi_cfg = NULL; + float throughput = 0; + bool has_spilled = false; + + if (devinfo->ver < 20) { + v8 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 8, 1, + params->base.stats != NULL, + debug_enabled); + if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + params->base.error_str = ralloc_strdup(params->base.mem_ctx, + v8->fail_msg); + return NULL; + } else if (INTEL_SIMD(FS, 8)) { + simd8_cfg = v8->cfg; + + assert(v8->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); + + const performance &perf = v8->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v8->spilled_any_registers; + allow_spilling = false; + } + } + + if (key->coarse_pixel && devinfo->ver < 20) { + if (prog_data->dual_src_blend) { + v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot" + " use SIMD8 messages.\n"); + } + v8->limit_dispatch_width(16, "SIMD32 not supported with coarse" + " pixel shading.\n"); + } + + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 16) && + (INTEL_SIMD(FS, 16) || params->use_rep_send)) { + /* Try a SIMD16 compile */ + v16 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v16->import_uniforms(v8.get()); + if (!v16->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD16 shader failed to compile: %s\n", + v16->fail_msg); + } else { + simd16_cfg = v16->cfg; + + assert(v16->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo); + + const performance &perf = v16->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v16->spilled_any_registers; + allow_spilling = false; + } + } + + const bool simd16_failed = v16 && !simd16_cfg; + + /* Currently, the compiler only supports SIMD32 on SNB+ */ + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 32) && + (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send && + !simd16_failed && + INTEL_SIMD(FS, 32)) { + /* Try a SIMD32 compile */ + v32 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v32->import_uniforms(v8.get()); + else if (v16) + v32->import_uniforms(v16.get()); + + if (!v32->run_fs(allow_spilling, false)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader failed to compile: %s\n", + v32->fail_msg); + } else { + const performance &perf = v32->performance_analysis.require(); + + if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader inefficient\n"); + } else { + simd32_cfg = v32->cfg; + + assert(v32->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo); + + throughput = MAX2(throughput, 
perf.throughput); + } + } + } + + if (devinfo->ver >= 12 && !has_spilled && + params->max_polygons >= 2 && !key->coarse_pixel) { + fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get(); + assert(vbase); + + if (devinfo->ver >= 20 && + params->max_polygons >= 4 && + vbase->max_dispatch_width >= 32 && + 4 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 4X8)) { + /* Try a quad-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 4, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Quad-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && devinfo->ver >= 20 && + vbase->max_dispatch_width >= 32 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X16)) { + /* Try a dual-SIMD16 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD16 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && vbase->max_dispatch_width >= 16 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X8)) { + /* Try a dual-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + } + } + + if (multi_cfg) { + assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo); + } + } + + /* When the caller requests a repclear shader, they want SIMD16-only */ + if (params->use_rep_send) + simd8_cfg = NULL; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + MESA_SHADER_FRAGMENT); + + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s fragment shader %s", + nir->info.label ? + nir->info.label : "unnamed", + nir->info.name)); + } + + struct brw_compile_stats *stats = params->base.stats; + uint32_t max_dispatch_width = 0; + + if (multi_cfg) { + prog_data->dispatch_multi = vmulti->dispatch_width; + prog_data->max_polygons = vmulti->max_polygons; + g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats, + vmulti->performance_analysis.require(), + stats, vmulti->max_polygons); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = vmulti->dispatch_width; + + } else if (simd8_cfg) { + prog_data->dispatch_8 = true; + g.generate_code(simd8_cfg, 8, v8->shader_stats, + v8->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8; + } + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_16 = g.generate_code( + simd16_cfg, 16, v16->shader_stats, + v16->performance_analysis.require(), stats, 1); + stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = 16; + } + + if (simd32_cfg) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = g.generate_code( + simd32_cfg, 32, v32->shader_stats, + v32->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 32; + } + + for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++) + s->max_dispatch_width = max_dispatch_width; + + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); +} diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp similarity index 100% rename from src/intel/compiler/brw_mesh.cpp rename to src/intel/compiler/brw_compile_mesh.cpp diff --git a/src/intel/compiler/brw_compile_tes.cpp b/src/intel/compiler/brw_compile_tes.cpp new file mode 100644 index 00000000000..c8baca58cb1 --- /dev/null +++ b/src/intel/compiler/brw_compile_tes.cpp @@ -0,0 +1,140 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_cfg.h" +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_nir.h" +#include "brw_private.h" +#include "dev/intel_debug.h" +#include "util/macros.h" + +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + brw_compile_tes_params *params) +{ + const struct intel_device_info *devinfo = compiler->devinfo; + nir_shader *nir = params->base.nir; + const struct brw_tes_prog_key *key = params->key; + const struct intel_vue_map *input_vue_map = params->input_vue_map; + struct brw_tes_prog_data *prog_data = params->prog_data; + + const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES); + + prog_data->base.base.stage = MESA_SHADER_TESS_EVAL; + prog_data->base.base.ray_queries = nir->info.ray_queries; + + nir->info.inputs_read = key->inputs_read; + nir->info.patch_inputs_read = key->patch_inputs_read; + + brw_nir_apply_key(nir, compiler, &key->base, + brw_geometry_stage_dispatch_width(compiler->devinfo)); + brw_nir_lower_tes_inputs(nir, input_vue_map); + brw_nir_lower_vue_outputs(nir); + brw_postprocess_nir(nir, compiler, debug_enabled, + key->base.robust_flags); + + brw_compute_vue_map(devinfo, &prog_data->base.vue_map, + nir->info.outputs_written, + nir->info.separate_shader, 1); + + unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GFX7_MAX_DS_URB_ENTRY_SIZE_BYTES) { + params->base.error_str = ralloc_strdup(params->base.mem_ctx, + "DS outputs exceed maximum size"); + return NULL; + } + + prog_data->base.clip_distance_mask = + ((1 << nir->info.clip_distance_array_size) - 1); + prog_data->base.cull_distance_mask = + ((1 << nir->info.cull_distance_array_size) - 1) << + nir->info.clip_distance_array_size; + + prog_data->include_primitive_id = + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); + + /* URB entry sizes are stored as a multiple of 64 bytes. 
*/ + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + prog_data->base.urb_read_length = 0; + + STATIC_ASSERT(INTEL_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1); + STATIC_ASSERT(INTEL_TESS_PARTITIONING_ODD_FRACTIONAL == + TESS_SPACING_FRACTIONAL_ODD - 1); + STATIC_ASSERT(INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL == + TESS_SPACING_FRACTIONAL_EVEN - 1); + + prog_data->partitioning = + (enum intel_tess_partitioning) (nir->info.tess.spacing - 1); + + switch (nir->info.tess._primitive_mode) { + case TESS_PRIMITIVE_QUADS: + prog_data->domain = INTEL_TESS_DOMAIN_QUAD; + break; + case TESS_PRIMITIVE_TRIANGLES: + prog_data->domain = INTEL_TESS_DOMAIN_TRI; + break; + case TESS_PRIMITIVE_ISOLINES: + prog_data->domain = INTEL_TESS_DOMAIN_ISOLINE; + break; + default: + unreachable("invalid domain shader primitive mode"); + } + + if (nir->info.tess.point_mode) { + prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_POINT; + } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) { + prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_LINE; + } else { + /* Hardware winding order is backwards from OpenGL */ + prog_data->output_topology = + nir->info.tess.ccw ? INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW + : INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW; + } + + if (unlikely(debug_enabled)) { + fprintf(stderr, "TES Input "); + brw_print_vue_map(stderr, input_vue_map, MESA_SHADER_TESS_EVAL); + fprintf(stderr, "TES Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map, + MESA_SHADER_TESS_EVAL); + } + + const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; + fs_visitor v(compiler, ¶ms->base, &key->base, + &prog_data->base.base, nir, dispatch_width, + params->base.stats != NULL, debug_enabled); + if (!v.run_tes()) { + params->base.error_str = + ralloc_strdup(params->base.mem_ctx, v.fail_msg); + return NULL; + } + + assert(v.payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); + + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, ¶ms->base, + &prog_data->base.base, MESA_SHADER_TESS_EVAL); + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s tessellation evaluation shader %s", + nir->info.label ? nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(v.cfg, dispatch_width, v.shader_stats, + v.performance_analysis.require(), params->base.stats); + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); +} + diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 34dc2827d7e..36d5679244b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -43,8 +43,6 @@ #include "compiler/nir/nir_builder.h" #include "util/u_math.h" -#include - using namespace brw; static void @@ -1091,17 +1089,6 @@ brw_barycentric_mode(const struct brw_wm_prog_key *key, return (enum brw_barycentric_mode) bary; } -/** - * Turn one of the two CENTROID barycentric modes into PIXEL mode. - */ -static enum brw_barycentric_mode -centroid_to_pixel(enum brw_barycentric_mode bary) -{ - assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || - bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); - return (enum brw_barycentric_mode) ((unsigned) bary - 1); -} - /** * Walk backwards from the end of the program looking for a URB write that * isn't in control flow, and mark it with EOT. 
@@ -1367,253 +1354,6 @@ brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data) wm_prog_data->urb_setup_attribs_count = index; } -static void -calculate_urb_setup(const struct intel_device_info *devinfo, - const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data, - const nir_shader *nir, - const struct brw_mue_map *mue_map) -{ - memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); - memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel)); - - int urb_next = 0; /* in vec4s */ - - const uint64_t inputs_read = - nir->info.inputs_read & ~nir->info.per_primitive_inputs; - - /* Figure out where each of the incoming setup attributes lands. */ - if (key->mesh_input != BRW_NEVER) { - /* Per-Primitive Attributes are laid out by Hardware before the regular - * attributes, so order them like this to make easy later to map setup - * into real HW registers. - */ - if (nir->info.per_primitive_inputs) { - uint64_t per_prim_inputs_read = - nir->info.inputs_read & nir->info.per_primitive_inputs; - - /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots - * are always at the beginning, because they come from MUE - * Primitive Header, not Per-Primitive Attributes. - */ - const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | - VARYING_BIT_LAYER | - VARYING_BIT_PRIMITIVE_SHADING_RATE; - - if (mue_map) { - unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; - unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; - - bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; - - if (reads_header || mue_map->user_data_in_primitive_header) { - /* Primitive Shading Rate, Layer and Viewport live in the same - * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport - * is dword 2). - */ - if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) - prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; - - if (per_prim_inputs_read & VARYING_BIT_LAYER) - prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; - - if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) - prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; - - per_prim_inputs_read &= ~primitive_header_bits; - } else { - /* If fs doesn't need primitive header, then it won't be made - * available through SBE_MESH, so we have to skip them when - * calculating offset from start of per-prim data. - */ - per_prim_start_dw += mue_map->per_primitive_header_size_dw; - per_prim_size_dw -= mue_map->per_primitive_header_size_dw; - } - - u_foreach_bit64(i, per_prim_inputs_read) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_prim_start_dw); - unsigned pos_dw = unsigned(start) - per_prim_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next = per_prim_size_dw / 4; - } else { - /* With no MUE map, we never read the primitive header, and - * per-primitive attributes won't be packed either, so just lay - * them in varying order. - */ - per_prim_inputs_read &= ~primitive_header_bits; - - for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { - if (per_prim_inputs_read & BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - - /* The actual setup attributes later must be aligned to a full GRF. 
*/ - urb_next = ALIGN(urb_next, 2); - } - - prog_data->num_per_primitive_inputs = urb_next; - } - - const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | - VARYING_BIT_CLIP_DIST1; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; - - if (inputs_read & clip_dist_bits) { - assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); - unique_fs_attrs &= ~clip_dist_bits; - } - - if (mue_map) { - unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; - unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; - - /* Per-Vertex header is available to fragment shader only if there's - * user data there. - */ - if (!mue_map->user_data_in_vertex_header) { - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* In Mesh, CLIP_DIST slots are always at the beginning, because - * they come from MUE Vertex Header, not Per-Vertex Attributes. - */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; - } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { - /* Clip distances are in MUE, but we are not reading them in FS. */ - per_vertex_start_dw += 8; - per_vertex_size_dw -= 8; - } - - /* Per-Vertex attributes are laid out ordered. Because we always link - * Mesh and Fragment shaders, the which slots are written and read by - * each of them will match. */ - u_foreach_bit64(i, unique_fs_attrs) { - int start = mue_map->start_dw[i]; - - assert(start >= 0); - assert(mue_map->len_dw[i] > 0); - - assert(unsigned(start) >= per_vertex_start_dw); - unsigned pos_dw = unsigned(start) - per_vertex_start_dw; - - prog_data->urb_setup[i] = urb_next + pos_dw / 4; - prog_data->urb_setup_channel[i] = pos_dw % 4; - } - - urb_next += per_vertex_size_dw / 4; - } else { - /* If we don't have an MUE map, just lay down the inputs the FS reads - * in varying order, as we do for the legacy pipeline. - */ - if (inputs_read & clip_dist_bits) { - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; - prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; - } - - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (unique_fs_attrs & BITFIELD64_BIT(i)) - prog_data->urb_setup[i] = urb_next++; - } - } - } else { - assert(!nir->info.per_primitive_inputs); - - uint64_t vue_header_bits = - VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; - - uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; - - /* VUE header fields all live in the same URB slot, so we pass them - * as a single FS input attribute. We want to only count them once. - */ - if (inputs_read & vue_header_bits) { - unique_fs_attrs &= ~vue_header_bits; - unique_fs_attrs |= VARYING_BIT_PSIZ; - } - - if (util_bitcount64(unique_fs_attrs) <= 16) { - /* The SF/SBE pipeline stage can do arbitrary rearrangement of the - * first 16 varying inputs, so we can put them wherever we want. - * Just put them in order. - * - * This is useful because it means that (a) inputs not used by the - * fragment shader won't take up valuable register space, and (b) we - * won't have to recompile the fragment shader if it gets paired with - * a different vertex (or geometry) shader. - * - * VUE header fields share the same FS input attribute. 
- */ - if (inputs_read & vue_header_bits) { - if (inputs_read & VARYING_BIT_PSIZ) - prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next; - if (inputs_read & VARYING_BIT_LAYER) - prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next; - if (inputs_read & VARYING_BIT_VIEWPORT) - prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next; - - urb_next++; - } - - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & - BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - } else { - /* We have enough input varyings that the SF/SBE pipeline stage can't - * arbitrarily rearrange them to suit our whim; we have to put them - * in an order that matches the output of the previous pipeline stage - * (geometry or vertex shader). - */ - - /* Re-compute the VUE map here in the case that the one coming from - * geometry has more than one position slot (used for Primitive - * Replication). - */ - struct intel_vue_map prev_stage_vue_map; - brw_compute_vue_map(devinfo, &prev_stage_vue_map, - key->input_slots_valid, - nir->info.separate_shader, 1); - - int first_slot = - brw_compute_first_urb_slot_required(inputs_read, - &prev_stage_vue_map); - - assert(prev_stage_vue_map.num_slots <= first_slot + 32); - for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; - slot++) { - int varying = prev_stage_vue_map.slot_to_varying[slot]; - if (varying != BRW_VARYING_SLOT_PAD && - (inputs_read & BRW_FS_VARYING_INPUT_MASK & - BITFIELD64_BIT(varying))) { - prog_data->urb_setup[varying] = slot - first_slot; - } - } - urb_next = prev_stage_vue_map.num_slots - first_slot; - } - } - - prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; - prog_data->inputs = inputs_read; - - brw_compute_urb_setup_index(prog_data); -} - void fs_visitor::assign_urb_setup() { @@ -3434,158 +3174,6 @@ fs_visitor::run_mesh(bool allow_spilling) return !failed; } -static bool -is_used_in_not_interp_frag_coord(nir_def *def) -{ - nir_foreach_use_including_if(src, def) { - if (nir_src_is_if(src)) - return true; - - if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic) - return true; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src)); - if (intrin->intrinsic != nir_intrinsic_load_frag_coord) - return true; - } - - return false; -} - -/** - * Return a bitfield where bit n is set if barycentric interpolation mode n - * (see enum brw_barycentric_mode) is needed by the fragment shader. - * - * We examine the load_barycentric intrinsics rather than looking at input - * variables so that we catch interpolateAtCentroid() messages too, which - * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. - */ -static unsigned -brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, - const struct brw_wm_prog_key *key, - const nir_shader *shader) -{ - unsigned barycentric_interp_modes = 0; - - nir_foreach_function_impl(impl, shader) { - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - switch (intrin->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: - break; - default: - continue; - } - - /* Ignore WPOS; it doesn't require interpolation. 
*/ - if (!is_used_in_not_interp_frag_coord(&intrin->def)) - continue; - - nir_intrinsic_op bary_op = intrin->intrinsic; - enum brw_barycentric_mode bary = - brw_barycentric_mode(key, intrin); - - barycentric_interp_modes |= 1 << bary; - - if (devinfo->needs_unlit_centroid_workaround && - bary_op == nir_intrinsic_load_barycentric_centroid) - barycentric_interp_modes |= 1 << centroid_to_pixel(bary); - } - } - } - - return barycentric_interp_modes; -} - -/** - * Return a bitfield where bit n is set if barycentric interpolation - * mode n (see enum brw_barycentric_mode) is needed by the fragment - * shader barycentric intrinsics that take an explicit offset or - * sample as argument. - */ -static unsigned -brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key, - const nir_shader *shader) -{ - unsigned barycentric_interp_modes = 0; - - nir_foreach_function_impl(impl, shader) { - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset || - intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample) - barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin); - } - } - } - - return barycentric_interp_modes; -} - -static void -brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, - const nir_shader *shader) -{ - prog_data->flat_inputs = 0; - - nir_foreach_shader_in_variable(var, shader) { - /* flat shading */ - if (var->data.interpolation != INTERP_MODE_FLAT) - continue; - - if (var->data.per_primitive) - continue; - - unsigned slots = glsl_count_attribute_slots(var->type, false); - for (unsigned s = 0; s < slots; s++) { - int input_index = prog_data->urb_setup[var->data.location + s]; - - if (input_index >= 0) - prog_data->flat_inputs |= 1 << input_index; - } - } -} - -static uint8_t -computed_depth_mode(const nir_shader *shader) -{ - if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - switch (shader->info.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_NONE: - case FRAG_DEPTH_LAYOUT_ANY: - return BRW_PSCDEPTH_ON; - case FRAG_DEPTH_LAYOUT_GREATER: - return BRW_PSCDEPTH_ON_GE; - case FRAG_DEPTH_LAYOUT_LESS: - return BRW_PSCDEPTH_ON_LE; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - /* We initially set this to OFF, but having the shader write the - * depth means we allocate register space in the SEND message. The - * difference between the SEND register count and the OFF state - * programming makes the HW hang. - * - * Removing the depth writes also leads to test failures. So use - * LesserThanOrEqual, which fits writing the same value - * (unchanged/equal). 
- * - */ - return BRW_PSCDEPTH_ON_LE; - } - } - return BRW_PSCDEPTH_OFF; -} /** * Move load_interpolated_input with simple (payload-based) barycentric modes @@ -3656,444 +3244,6 @@ brw_nir_move_interpolation_to_top(nir_shader *nir) return progress; } -static void -brw_nir_populate_wm_prog_data(nir_shader *shader, - const struct intel_device_info *devinfo, - const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data, - const struct brw_mue_map *mue_map) -{ - prog_data->uses_kill = shader->info.fs.uses_discard; - prog_data->uses_omask = !key->ignore_sample_mask_out && - (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); - prog_data->max_polygons = 1; - prog_data->computed_depth_mode = computed_depth_mode(shader); - prog_data->computed_stencil = - shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - - prog_data->sample_shading = - shader->info.fs.uses_sample_shading || - shader->info.outputs_read; - - assert(key->multisample_fbo != BRW_NEVER || - key->persample_interp == BRW_NEVER); - - prog_data->persample_dispatch = key->persample_interp; - if (prog_data->sample_shading) - prog_data->persample_dispatch = BRW_ALWAYS; - - /* We can only persample dispatch if we have a multisample FBO */ - prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, - key->multisample_fbo); - - /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If - * persample_dispatch & multisample_fbo are not dynamic, Anv should be able - * to definitively tell whether alpha_to_coverage is on or off. - */ - prog_data->alpha_to_coverage = key->alpha_to_coverage; - - prog_data->uses_sample_mask = - BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); - - /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: - * - * "MSDISPMODE_PERSAMPLE is required in order to select - * POSOFFSET_SAMPLE" - * - * So we can only really get sample positions if we are doing real - * per-sample dispatch. If we need gl_SamplePosition and we don't have - * persample dispatch, we hard-code it to 0.5. - */ - prog_data->uses_pos_offset = - prog_data->persample_dispatch != BRW_NEVER && - (BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); - - prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; - prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; - prog_data->inner_coverage = shader->info.fs.inner_coverage; - - prog_data->barycentric_interp_modes = - brw_compute_barycentric_interp_modes(devinfo, key, shader); - - /* From the BDW PRM documentation for 3DSTATE_WM: - * - * "MSDISPMODE_PERSAMPLE is required in order to select Perspective - * Sample or Non- perspective Sample barycentric coordinates." - * - * So cleanup any potentially set sample barycentric mode when not in per - * sample dispatch. 
- */ - if (prog_data->persample_dispatch == BRW_NEVER) { - prog_data->barycentric_interp_modes &= - ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); - } - - if (devinfo->ver >= 20) { - const unsigned offset_bary_modes = - brw_compute_offset_barycentric_interp_modes(key, shader); - - prog_data->uses_npc_bary_coefficients = - offset_bary_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS; - prog_data->uses_pc_bary_coefficients = - offset_bary_modes & ~BRW_BARYCENTRIC_NONPERSPECTIVE_BITS; - prog_data->uses_sample_offsets = - offset_bary_modes & ((1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) | - (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); - } - - prog_data->uses_nonperspective_interp_modes = - (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) || - prog_data->uses_npc_bary_coefficients; - - /* The current VK_EXT_graphics_pipeline_library specification requires - * coarse to specified at compile time. But per sample interpolation can be - * dynamic. So we should never be in a situation where coarse & - * persample_interp are both respectively true & BRW_ALWAYS. - * - * Coarse will dynamically turned off when persample_interp is active. - */ - assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); - - prog_data->coarse_pixel_dispatch = - brw_sometimes_invert(prog_data->persample_dispatch); - if (!key->coarse_pixel || - prog_data->uses_omask || - prog_data->sample_shading || - prog_data->uses_sample_mask || - (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || - prog_data->computed_stencil) { - prog_data->coarse_pixel_dispatch = BRW_NEVER; - } - - /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, - * Message Descriptor : - * - * "Message Type. Specifies the type of message being sent when - * pixel-rate evaluation is requested : - * - * Format = U2 - * 0: Per Message Offset (eval_snapped with immediate offset) - * 1: Sample Position Offset (eval_sindex) - * 2: Centroid Position Offset (eval_centroid) - * 3: Per Slot Offset (eval_snapped with register offset) - * - * Message Type. Specifies the type of message being sent when - * coarse-rate evaluation is requested : - * - * Format = U2 - * 0: Coarse to Pixel Mapping Message (internal message) - * 1: Reserved - * 2: Coarse Centroid Position (eval_centroid) - * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" - * - * The Sample Position Offset is marked as reserved for coarse rate - * evaluation and leads to hangs if we try to use it. So disable coarse - * pixel shading if we have any intrinsic that will result in a pixel - * interpolater message at sample. - */ - if (intel_nir_pulls_at_sample(shader)) - prog_data->coarse_pixel_dispatch = BRW_NEVER; - - /* We choose to always enable VMask prior to XeHP, as it would cause - * us to lose out on the eliminate_find_live_channel() optimization. 
- */ - prog_data->uses_vmask = devinfo->verx10 < 125 || - shader->info.fs.needs_quad_helper_invocations || - shader->info.uses_wide_subgroup_intrinsics || - prog_data->coarse_pixel_dispatch != BRW_NEVER; - - prog_data->uses_src_w = - BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - prog_data->uses_src_depth = - BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - prog_data->coarse_pixel_dispatch != BRW_ALWAYS; - prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients || - (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - prog_data->coarse_pixel_dispatch != BRW_NEVER); - - calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); - brw_compute_flat_inputs(prog_data, shader); -} - -const unsigned * -brw_compile_fs(const struct brw_compiler *compiler, - struct brw_compile_fs_params *params) -{ - struct nir_shader *nir = params->base.nir; - const struct brw_wm_prog_key *key = params->key; - struct brw_wm_prog_data *prog_data = params->prog_data; - bool allow_spilling = params->allow_spilling; - const bool debug_enabled = - brw_should_print_shader(nir, params->base.debug_flag ? - params->base.debug_flag : DEBUG_WM); - - prog_data->base.stage = MESA_SHADER_FRAGMENT; - prog_data->base.ray_queries = nir->info.ray_queries; - prog_data->base.total_scratch = 0; - - const struct intel_device_info *devinfo = compiler->devinfo; - const unsigned max_subgroup_size = 32; - - brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size); - brw_nir_lower_fs_inputs(nir, devinfo, key); - brw_nir_lower_fs_outputs(nir); - - /* From the SKL PRM, Volume 7, "Alpha Coverage": - * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in - * hardware, regardless of the state setting for this feature." - */ - if (key->alpha_to_coverage != BRW_NEVER) { - /* Run constant fold optimization in order to get the correct source - * offset to determine render target 0 store instruction in - * emit_alpha_to_coverage pass. 
- */ - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); - } - - NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); - brw_postprocess_nir(nir, compiler, debug_enabled, - key->base.robust_flags); - - brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, - params->mue_map); - - std::unique_ptr v8, v16, v32, vmulti; - cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL, - *multi_cfg = NULL; - float throughput = 0; - bool has_spilled = false; - - if (devinfo->ver < 20) { - v8 = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 8, 1, - params->base.stats != NULL, - debug_enabled); - if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { - params->base.error_str = ralloc_strdup(params->base.mem_ctx, - v8->fail_msg); - return NULL; - } else if (INTEL_SIMD(FS, 8)) { - simd8_cfg = v8->cfg; - - assert(v8->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); - - const performance &perf = v8->performance_analysis.require(); - throughput = MAX2(throughput, perf.throughput); - has_spilled = v8->spilled_any_registers; - allow_spilling = false; - } - } - - if (key->coarse_pixel && devinfo->ver < 20) { - if (prog_data->dual_src_blend) { - v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot" - " use SIMD8 messages.\n"); - } - v8->limit_dispatch_width(16, "SIMD32 not supported with coarse" - " pixel shading.\n"); - } - - if (!has_spilled && - (!v8 || v8->max_dispatch_width >= 16) && - (INTEL_SIMD(FS, 16) || params->use_rep_send)) { - /* Try a SIMD16 compile */ - v16 = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 16, 1, - params->base.stats != NULL, - debug_enabled); - if (v8) - v16->import_uniforms(v8.get()); - if (!v16->run_fs(allow_spilling, params->use_rep_send)) { - brw_shader_perf_log(compiler, params->base.log_data, - "SIMD16 shader failed to compile: %s\n", - v16->fail_msg); - } else { - simd16_cfg = v16->cfg; - - assert(v16->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo); - - const performance &perf = v16->performance_analysis.require(); - throughput = MAX2(throughput, perf.throughput); - has_spilled = v16->spilled_any_registers; - allow_spilling = false; - } - } - - const bool simd16_failed = v16 && !simd16_cfg; - - /* Currently, the compiler only supports SIMD32 on SNB+ */ - if (!has_spilled && - (!v8 || v8->max_dispatch_width >= 32) && - (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send && - !simd16_failed && - INTEL_SIMD(FS, 32)) { - /* Try a SIMD32 compile */ - v32 = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 32, 1, - params->base.stats != NULL, - debug_enabled); - if (v8) - v32->import_uniforms(v8.get()); - else if (v16) - v32->import_uniforms(v16.get()); - - if (!v32->run_fs(allow_spilling, false)) { - brw_shader_perf_log(compiler, params->base.log_data, - "SIMD32 shader failed to compile: %s\n", - v32->fail_msg); - } else { - const performance &perf = v32->performance_analysis.require(); - - if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) { - brw_shader_perf_log(compiler, params->base.log_data, - "SIMD32 shader inefficient\n"); - } else { - simd32_cfg = v32->cfg; - - assert(v32->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo); - - throughput = MAX2(throughput, 
perf.throughput); - } - } - } - - if (devinfo->ver >= 12 && !has_spilled && - params->max_polygons >= 2 && !key->coarse_pixel) { - fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get(); - assert(vbase); - - if (devinfo->ver >= 20 && - params->max_polygons >= 4 && - vbase->max_dispatch_width >= 32 && - 4 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 4X8)) { - /* Try a quad-SIMD8 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 32, 4, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { - brw_shader_perf_log(compiler, params->base.log_data, - "Quad-SIMD8 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - assert(!vmulti->spilled_any_registers); - } - } - - if (!multi_cfg && devinfo->ver >= 20 && - vbase->max_dispatch_width >= 32 && - 2 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 2X16)) { - /* Try a dual-SIMD16 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 32, 2, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { - brw_shader_perf_log(compiler, params->base.log_data, - "Dual-SIMD16 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - assert(!vmulti->spilled_any_registers); - } - } - - if (!multi_cfg && vbase->max_dispatch_width >= 16 && - 2 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 2X8)) { - /* Try a dual-SIMD8 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 16, 2, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { - brw_shader_perf_log(compiler, params->base.log_data, - "Dual-SIMD8 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - } - } - - if (multi_cfg) { - assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo); - } - } - - /* When the caller requests a repclear shader, they want SIMD16-only */ - if (params->use_rep_send) - simd8_cfg = NULL; - - fs_generator g(compiler, ¶ms->base, &prog_data->base, - MESA_SHADER_FRAGMENT); - - if (unlikely(debug_enabled)) { - g.enable_debug(ralloc_asprintf(params->base.mem_ctx, - "%s fragment shader %s", - nir->info.label ? - nir->info.label : "unnamed", - nir->info.name)); - } - - struct brw_compile_stats *stats = params->base.stats; - uint32_t max_dispatch_width = 0; - - if (multi_cfg) { - prog_data->dispatch_multi = vmulti->dispatch_width; - prog_data->max_polygons = vmulti->max_polygons; - g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats, - vmulti->performance_analysis.require(), - stats, vmulti->max_polygons); - stats = stats ? stats + 1 : NULL; - max_dispatch_width = vmulti->dispatch_width; - - } else if (simd8_cfg) { - prog_data->dispatch_8 = true; - g.generate_code(simd8_cfg, 8, v8->shader_stats, - v8->performance_analysis.require(), stats, 1); - stats = stats ? stats + 1 : NULL; - max_dispatch_width = 8; - } - - if (simd16_cfg) { - prog_data->dispatch_16 = true; - prog_data->prog_offset_16 = g.generate_code( - simd16_cfg, 16, v16->shader_stats, - v16->performance_analysis.require(), stats, 1); - stats = stats ? 
stats + 1 : NULL; - max_dispatch_width = 16; - } - - if (simd32_cfg) { - prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code( - simd32_cfg, 32, v32->shader_stats, - v32->performance_analysis.require(), stats, 1); - stats = stats ? stats + 1 : NULL; - max_dispatch_width = 32; - } - - for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++) - s->max_dispatch_width = max_dispatch_width; - - g.add_const_data(nir->constant_data, nir->constant_data_size); - return g.get_assembly(); -} - unsigned brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, unsigned threads) @@ -4104,47 +3254,6 @@ brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, cs_prog_data->push.cross_thread.size; } -static void -fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) -{ - block->dwords = dwords; - block->regs = DIV_ROUND_UP(dwords, 8); - block->size = block->regs * 32; -} - -static void -cs_fill_push_const_info(const struct intel_device_info *devinfo, - struct brw_cs_prog_data *cs_prog_data) -{ - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); - - /* The thread ID should be stored in the last param dword */ - assert(subgroup_id_index == -1 || - subgroup_id_index == (int)prog_data->nr_params - 1); - - unsigned cross_thread_dwords, per_thread_dwords; - if (subgroup_id_index >= 0) { - /* Fill all but the last register with cross-thread payload */ - cross_thread_dwords = 8 * (subgroup_id_index / 8); - per_thread_dwords = prog_data->nr_params - cross_thread_dwords; - assert(per_thread_dwords > 0 && per_thread_dwords <= 8); - } else { - /* Fill all data using cross-thread payload */ - cross_thread_dwords = prog_data->nr_params; - per_thread_dwords = 0u; - } - - fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); - fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); - - assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || - cs_prog_data->push.per_thread.size == 0); - assert(cs_prog_data->push.cross_thread.dwords + - cs_prog_data->push.per_thread.dwords == - prog_data->nr_params); -} - static bool filter_simd(const nir_instr *instr, const void * /* options */) { @@ -4195,131 +3304,6 @@ brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width) (void *)(uintptr_t)dispatch_width); } -const unsigned * -brw_compile_cs(const struct brw_compiler *compiler, - struct brw_compile_cs_params *params) -{ - const nir_shader *nir = params->base.nir; - const struct brw_cs_prog_key *key = params->key; - struct brw_cs_prog_data *prog_data = params->prog_data; - - const bool debug_enabled = - brw_should_print_shader(nir, params->base.debug_flag ? 
- params->base.debug_flag : DEBUG_CS); - - prog_data->base.stage = MESA_SHADER_COMPUTE; - prog_data->base.total_shared = nir->info.shared_size; - prog_data->base.ray_queries = nir->info.ray_queries; - prog_data->base.total_scratch = 0; - - if (!nir->info.workgroup_size_variable) { - prog_data->local_size[0] = nir->info.workgroup_size[0]; - prog_data->local_size[1] = nir->info.workgroup_size[1]; - prog_data->local_size[2] = nir->info.workgroup_size[2]; - } - - brw_simd_selection_state simd_state{ - .devinfo = compiler->devinfo, - .prog_data = prog_data, - .required_width = brw_required_dispatch_width(&nir->info), - }; - - std::unique_ptr v[3]; - - for (unsigned simd = 0; simd < 3; simd++) { - if (!brw_simd_should_compile(simd_state, simd)) - continue; - - const unsigned dispatch_width = 8u << simd; - - nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); - brw_nir_apply_key(shader, compiler, &key->base, - dispatch_width); - - NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); - - /* Clean up after the local index and ID calculations. */ - NIR_PASS(_, shader, nir_opt_constant_folding); - NIR_PASS(_, shader, nir_opt_dce); - - brw_postprocess_nir(shader, compiler, debug_enabled, - key->base.robust_flags); - - v[simd] = std::make_unique(compiler, ¶ms->base, - &key->base, - &prog_data->base, - shader, dispatch_width, - params->base.stats != NULL, - debug_enabled); - - const int first = brw_simd_first_compiled(simd_state); - if (first >= 0) - v[simd]->import_uniforms(v[first].get()); - - const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; - - if (v[simd]->run_cs(allow_spilling)) { - cs_fill_push_const_info(compiler->devinfo, prog_data); - - brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); - } else { - simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); - if (simd > 0) { - brw_shader_perf_log(compiler, params->base.log_data, - "SIMD%u shader failed to compile: %s\n", - dispatch_width, v[simd]->fail_msg); - } - } - } - - const int selected_simd = brw_simd_select(simd_state); - if (selected_simd < 0) { - params->base.error_str = - ralloc_asprintf(params->base.mem_ctx, - "Can't compile shader: " - "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", - simd_state.error[0], simd_state.error[1], - simd_state.error[2]); - return NULL; - } - - assert(selected_simd < 3); - - if (!nir->info.workgroup_size_variable) - prog_data->prog_mask = 1 << selected_simd; - - fs_generator g(compiler, ¶ms->base, &prog_data->base, - MESA_SHADER_COMPUTE); - if (unlikely(debug_enabled)) { - char *name = ralloc_asprintf(params->base.mem_ctx, - "%s compute shader %s", - nir->info.label ? - nir->info.label : "unnamed", - nir->info.name); - g.enable_debug(name); - } - - uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); - - struct brw_compile_stats *stats = params->base.stats; - for (unsigned simd = 0; simd < 3; simd++) { - if (prog_data->prog_mask & (1u << simd)) { - assert(v[simd]); - prog_data->prog_offset[simd] = - g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, - v[simd]->performance_analysis.require(), stats); - if (stats) - stats->max_dispatch_width = max_dispatch_width; - stats = stats ? 
stats + 1 : NULL; - max_dispatch_width = 8u << simd; - } - } - - g.add_const_data(nir->constant_data, nir->constant_data_size); - - return g.get_assembly(); -} - struct intel_cs_dispatch_info brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, const struct brw_cs_prog_data *prog_data, @@ -4347,185 +3331,6 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, return info; } -static uint8_t -compile_single_bs(const struct brw_compiler *compiler, - struct brw_compile_bs_params *params, - const struct brw_bs_prog_key *key, - struct brw_bs_prog_data *prog_data, - nir_shader *shader, - fs_generator *g, - struct brw_compile_stats *stats, - int *prog_offset) -{ - const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); - - prog_data->base.stage = shader->info.stage; - prog_data->max_stack_size = MAX2(prog_data->max_stack_size, - shader->scratch_size); - - const unsigned max_dispatch_width = 16; - brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width); - brw_postprocess_nir(shader, compiler, debug_enabled, - key->base.robust_flags); - - brw_simd_selection_state simd_state{ - .devinfo = compiler->devinfo, - .prog_data = prog_data, - - /* Since divergence is a lot more likely in RT than compute, it makes - * sense to limit ourselves to the smallest available SIMD for now. - */ - .required_width = compiler->devinfo->ver >= 20 ? 16u : 8u, - }; - - std::unique_ptr v[2]; - - for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { - if (!brw_simd_should_compile(simd_state, simd)) - continue; - - const unsigned dispatch_width = 8u << simd; - - if (dispatch_width == 8 && compiler->devinfo->ver >= 20) - continue; - - v[simd] = std::make_unique(compiler, ¶ms->base, - &key->base, - &prog_data->base, shader, - dispatch_width, - stats != NULL, - debug_enabled); - - const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (v[simd]->run_bs(allow_spilling)) { - brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); - } else { - simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, - v[simd]->fail_msg); - if (simd > 0) { - brw_shader_perf_log(compiler, params->base.log_data, - "SIMD%u shader failed to compile: %s", - dispatch_width, v[simd]->fail_msg); - } - } - } - - const int selected_simd = brw_simd_select(simd_state); - if (selected_simd < 0) { - params->base.error_str = - ralloc_asprintf(params->base.mem_ctx, - "Can't compile shader: " - "SIMD8 '%s' and SIMD16 '%s'.\n", - simd_state.error[0], simd_state.error[1]); - return 0; - } - - assert(selected_simd < int(ARRAY_SIZE(v))); - fs_visitor *selected = v[selected_simd].get(); - assert(selected); - - const unsigned dispatch_width = selected->dispatch_width; - - int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, - selected->performance_analysis.require(), stats); - if (prog_offset) - *prog_offset = offset; - else - assert(offset == 0); - - return dispatch_width; -} - -uint64_t -brw_bsr(const struct intel_device_info *devinfo, - uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset) -{ - assert(offset % 64 == 0); - assert(simd_size == 8 || simd_size == 16); - assert(local_arg_offset % 8 == 0); - - return offset | - SET_BITS(simd_size == 8, 4, 4) | - SET_BITS(local_arg_offset / 8, 2, 0); -} - -const unsigned * -brw_compile_bs(const struct brw_compiler *compiler, - struct brw_compile_bs_params *params) -{ - nir_shader *shader = params->base.nir; - struct brw_bs_prog_data *prog_data = params->prog_data; - unsigned num_resume_shaders = 
params->num_resume_shaders; - nir_shader **resume_shaders = params->resume_shaders; - const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); - - prog_data->base.stage = shader->info.stage; - prog_data->base.ray_queries = shader->info.ray_queries; - prog_data->base.total_scratch = 0; - - prog_data->max_stack_size = 0; - prog_data->num_resume_shaders = num_resume_shaders; - - fs_generator g(compiler, ¶ms->base, &prog_data->base, - shader->info.stage); - if (unlikely(debug_enabled)) { - char *name = ralloc_asprintf(params->base.mem_ctx, - "%s %s shader %s", - shader->info.label ? - shader->info.label : "unnamed", - gl_shader_stage_name(shader->info.stage), - shader->info.name); - g.enable_debug(name); - } - - prog_data->simd_size = - compile_single_bs(compiler, params, params->key, prog_data, - shader, &g, params->base.stats, NULL); - if (prog_data->simd_size == 0) - return NULL; - - uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx, - uint64_t, num_resume_shaders); - for (unsigned i = 0; i < num_resume_shaders; i++) { - if (INTEL_DEBUG(DEBUG_RT)) { - char *name = ralloc_asprintf(params->base.mem_ctx, - "%s %s resume(%u) shader %s", - shader->info.label ? - shader->info.label : "unnamed", - gl_shader_stage_name(shader->info.stage), - i, shader->info.name); - g.enable_debug(name); - } - - /* TODO: Figure out shader stats etc. for resume shaders */ - int offset = 0; - uint8_t simd_size = - compile_single_bs(compiler, params, params->key, - prog_data, resume_shaders[i], &g, NULL, &offset); - if (simd_size == 0) - return NULL; - - assert(offset > 0); - resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0); - } - - /* We only have one constant data so we want to make sure they're all the - * same. - */ - for (unsigned i = 0; i < num_resume_shaders; i++) { - assert(resume_shaders[i]->constant_data_size == - shader->constant_data_size); - assert(memcmp(resume_shaders[i]->constant_data, - shader->constant_data, - shader->constant_data_size) == 0); - } - - g.add_const_data(shader->constant_data, shader->constant_data_size); - g.add_resume_sbt(num_resume_shaders, resume_sbt); - - return g.get_assembly(); -} - unsigned fs_visitor::workgroup_size() const { diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index b1aa49a792b..59cb909a4ec 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -22,11 +22,7 @@ */ #include "brw_cfg.h" -#include "brw_eu.h" #include "brw_fs.h" -#include "brw_nir.h" -#include "brw_private.h" -#include "dev/intel_debug.h" #include "util/macros.h" bool @@ -605,129 +601,3 @@ fs_inst::remove(bblock_t *block, bool defer_later_block_ip_updates) exec_node::remove(); } -extern "C" const unsigned * -brw_compile_tes(const struct brw_compiler *compiler, - brw_compile_tes_params *params) -{ - const struct intel_device_info *devinfo = compiler->devinfo; - nir_shader *nir = params->base.nir; - const struct brw_tes_prog_key *key = params->key; - const struct intel_vue_map *input_vue_map = params->input_vue_map; - struct brw_tes_prog_data *prog_data = params->prog_data; - - const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES); - - prog_data->base.base.stage = MESA_SHADER_TESS_EVAL; - prog_data->base.base.ray_queries = nir->info.ray_queries; - - nir->info.inputs_read = key->inputs_read; - nir->info.patch_inputs_read = key->patch_inputs_read; - - brw_nir_apply_key(nir, compiler, &key->base, - brw_geometry_stage_dispatch_width(compiler->devinfo)); - brw_nir_lower_tes_inputs(nir, 
input_vue_map); - brw_nir_lower_vue_outputs(nir); - brw_postprocess_nir(nir, compiler, debug_enabled, - key->base.robust_flags); - - brw_compute_vue_map(devinfo, &prog_data->base.vue_map, - nir->info.outputs_written, - nir->info.separate_shader, 1); - - unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4; - - assert(output_size_bytes >= 1); - if (output_size_bytes > GFX7_MAX_DS_URB_ENTRY_SIZE_BYTES) { - params->base.error_str = ralloc_strdup(params->base.mem_ctx, - "DS outputs exceed maximum size"); - return NULL; - } - - prog_data->base.clip_distance_mask = - ((1 << nir->info.clip_distance_array_size) - 1); - prog_data->base.cull_distance_mask = - ((1 << nir->info.cull_distance_array_size) - 1) << - nir->info.clip_distance_array_size; - - prog_data->include_primitive_id = - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); - - /* URB entry sizes are stored as a multiple of 64 bytes. */ - prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; - - prog_data->base.urb_read_length = 0; - - STATIC_ASSERT(INTEL_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1); - STATIC_ASSERT(INTEL_TESS_PARTITIONING_ODD_FRACTIONAL == - TESS_SPACING_FRACTIONAL_ODD - 1); - STATIC_ASSERT(INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL == - TESS_SPACING_FRACTIONAL_EVEN - 1); - - prog_data->partitioning = - (enum intel_tess_partitioning) (nir->info.tess.spacing - 1); - - switch (nir->info.tess._primitive_mode) { - case TESS_PRIMITIVE_QUADS: - prog_data->domain = INTEL_TESS_DOMAIN_QUAD; - break; - case TESS_PRIMITIVE_TRIANGLES: - prog_data->domain = INTEL_TESS_DOMAIN_TRI; - break; - case TESS_PRIMITIVE_ISOLINES: - prog_data->domain = INTEL_TESS_DOMAIN_ISOLINE; - break; - default: - unreachable("invalid domain shader primitive mode"); - } - - if (nir->info.tess.point_mode) { - prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_POINT; - } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) { - prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_LINE; - } else { - /* Hardware winding order is backwards from OpenGL */ - prog_data->output_topology = - nir->info.tess.ccw ? INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW - : INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW; - } - - if (unlikely(debug_enabled)) { - fprintf(stderr, "TES Input "); - brw_print_vue_map(stderr, input_vue_map, MESA_SHADER_TESS_EVAL); - fprintf(stderr, "TES Output "); - brw_print_vue_map(stderr, &prog_data->base.vue_map, - MESA_SHADER_TESS_EVAL); - } - - const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; - fs_visitor v(compiler, ¶ms->base, &key->base, - &prog_data->base.base, nir, dispatch_width, - params->base.stats != NULL, debug_enabled); - if (!v.run_tes()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - assert(v.payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); - - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - - fs_generator g(compiler, ¶ms->base, - &prog_data->base.base, MESA_SHADER_TESS_EVAL); - if (unlikely(debug_enabled)) { - g.enable_debug(ralloc_asprintf(params->base.mem_ctx, - "%s tessellation evaluation shader %s", - nir->info.label ? 
nir->info.label
-                                    : "unnamed",
-                                    nir->info.name));
-   }
-
-   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                   v.performance_analysis.require(), params->base.stats);
-
-   g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-   return g.get_assembly();
-}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 2de26b9e655..12a8a8de201 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -23,8 +23,13 @@ intel_nir_files = files(
 libintel_compiler_brw_files = files(
   'brw_cfg.cpp',
   'brw_cfg.h',
+  'brw_compile_bs.cpp',
+  'brw_compile_cs.cpp',
+  'brw_compile_fs.cpp',
   'brw_compile_gs.cpp',
+  'brw_compile_mesh.cpp',
   'brw_compile_tcs.cpp',
+  'brw_compile_tes.cpp',
   'brw_compile_vs.cpp',
   'brw_compiler.c',
   'brw_compiler.h',
@@ -80,7 +85,6 @@ libintel_compiler_brw_files = files(
   'brw_ir_performance.cpp',
   'brw_isa_info.h',
   'brw_lower_logical_sends.cpp',
-  'brw_mesh.cpp',
   'brw_nir.h',
   'brw_nir.c',
   'brw_nir_analyze_ubo_ranges.c',