diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 75fbd1c5c74..516397fdafc 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -273,6 +273,7 @@ files_libnir = files(
   'nir_opt_undef.c',
   'nir_opt_uniform_atomics.c',
   'nir_opt_uniform_subgroup.c',
+  'nir_opt_varyings.c',
   'nir_opt_vectorize.c',
   'nir_passthrough_gs.c',
   'nir_passthrough_tcs.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index a008d0cd055..52d51b0f7f0 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3577,6 +3577,16 @@ typedef enum {
    */
   nir_io_has_flexible_input_interpolation_except_flat = BITFIELD_BIT(0),
 
+  /**
+   * nir_opt_varyings compacts (relocates) components of varyings by
+   * rewriting their locations completely, effectively moving components of
+   * varyings between slots. This option forces nir_opt_varyings to make
+   * VARYING_SLOT_POS unused by moving its contents to VARn if the consumer
+   * is not FS. If this option is not set and POS is unused, it moves
+   * components of VARn to POS until it's fully used.
+   */
+  nir_io_dont_use_pos_for_non_fs_varyings = BITFIELD_BIT(1),
+
   /* Options affecting the GLSL compiler are below. */
 
   /**
@@ -3584,6 +3594,17 @@ typedef enum {
    * This only affects GLSL compilation.
    */
   nir_io_glsl_lower_derefs = BITFIELD_BIT(16),
+
+  /**
+   * Run nir_opt_varyings in the GLSL linker. If false, optimize varyings
+   * the old way and lower IO later.
+   *
+   * nir_io_lower_to_intrinsics must be set for this to take effect.
+   *
+   * TODO: remove this and default to enabled once we are sure that this
+   * codepath is solid.
+   */
+  nir_io_glsl_opt_varyings = BITFIELD_BIT(17),
 } nir_io_options;
 
 /** An instruction filtering callback
@@ -4083,6 +4104,31 @@ typedef struct nir_shader_compiler_options {
    * Used by nir_lower_io_passes.
    */
   void (*lower_mediump_io)(struct nir_shader *nir);
+
+  /**
+   * Return the maximum cost of an expression that's written to a shader
+   * output that can be moved into the next shader to remove that output.
+   *
+   * Currently only uniform expressions are moved. A uniform expression is
+   * any ALU expression sourcing only constants, uniforms, and UBO loads.
+   *
+   * Set to NULL or return 0 if you only want to propagate constants from
+   * outputs to inputs.
+   *
+   * Drivers can set the maximum cost based on the types of consecutive
+   * shaders or shader SHA1s.
+   *
+   * Drivers should also set "varying_estimate_instr_cost".
+   */
+  unsigned (*varying_expression_max_cost)(struct nir_shader *consumer,
+                                          struct nir_shader *producer);
+
+  /**
+   * Return the cost of an instruction that could be moved into the next
+   * shader. If the cost of all instructions in an expression is <=
+   * varying_expression_max_cost(), the expression is moved.
+   */
+  unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr);
 } nir_shader_compiler_options;
 
 typedef struct nir_shader {
@@ -5204,6 +5250,22 @@ nir_deref_instr *nir_clone_deref_instr(struct nir_builder *b,
                                        nir_variable *var,
                                        nir_deref_instr *deref);
+
+/* Return status from nir_opt_varyings. */
+typedef enum {
+   /* Whether the IR changed such that NIR optimizations should be run, such
+    * as due to removal of loads and stores. IO semantic changes such as
+    * compaction don't count as IR changes because they don't affect NIR
+    * optimizations.
+    */
+   nir_progress_producer = BITFIELD_BIT(0),
+   nir_progress_consumer = BITFIELD_BIT(1),
+} nir_opt_varyings_progress;
+
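+/* A sketch of the intended calling pattern (hypothetical driver code; the
+ * limits 64 and 16 are arbitrary example values, and optimize_nir() stands
+ * in for the driver's NIR optimization loop):
+ *
+ *    nir_opt_varyings_progress p =
+ *       nir_opt_varyings(producer, consumer, false, 64, 16);
+ *    if (p & nir_progress_producer)
+ *       optimize_nir(producer);
+ *    if (p & nir_progress_consumer)
+ *       optimize_nir(consumer);
+ */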
+nir_opt_varyings_progress
+nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
+                 unsigned max_uniform_components, unsigned max_ubos_per_stage);
+
 bool nir_slot_is_sysval_output(gl_varying_slot slot,
                                gl_shader_stage next_shader);
 bool nir_slot_is_varying(gl_varying_slot slot);
diff --git a/src/compiler/nir/nir_opt_varyings.c b/src/compiler/nir/nir_opt_varyings.c
new file mode 100644
index 00000000000..f4331c0b879
--- /dev/null
+++ b/src/compiler/nir/nir_opt_varyings.c
@@ -0,0 +1,4091 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Introduction
+ * ============
+ *
+ * This pass optimizes varyings between 2 shaders, which means dead input/
+ * output removal, constant and uniform load propagation, deduplication,
+ * compaction, and inter-shader code motion. This is used during the shader
+ * linking process.
+ *
+ *
+ * Notes on behavior
+ * =================
+ *
+ * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector
+ * varyings are not allowed.
+ *
+ * Indirectly-indexed varying slots (not vertices) are not optimized or
+ * compacted, but unused slots of indirectly-indexed varyings are still filled
+ * with directly-indexed varyings during compaction. Indirectly-indexed
+ * varyings are still removed if they are unused by the other shader.
+ *
+ * Indirectly-indexed vertices don't disallow optimizations, but compromises
+ * are made depending on how they are accessed. They are common in TCS, TES,
+ * and GS, so there is a desire to optimize them as much as possible. More on
+ * that in various sections below.
+ *
+ * Transform feedback doesn't prevent most optimizations such as constant
+ * propagation and compaction. Shaders can be left with output stores that set
+ * the no_varying flag, meaning the output is not consumed by the next shader,
+ * which means that optimizations did their job and now the output is only
+ * consumed by transform feedback.
+ *
+ * All legacy varying slots are optimized when it's allowed.
+ *
+ *
+ * Convergence property of shader outputs
+ * ======================================
+ *
+ * When an output stores an SSA that is convergent and all stores of that
+ * output appear in unconditional blocks or conditional blocks with
+ * a convergent entry condition and the shader is not GS, it implies that all
+ * vertices of that output have the same value, therefore the output can be
+ * promoted to flat because all interpolation modes lead to the same result
+ * as flat. Such outputs are opportunistically compacted with both flat and
+ * non-flat varyings based on whichever has unused slots in their vec4s. This
+ * pass refers to such inputs, outputs, and varyings as "convergent" (meaning
+ * all vertices are always equal).
+ *
+ * Flat varyings are the only ones that are never considered convergent
+ * because we want the flexibility to pack convergent varyings with both flat
+ * and non-flat varyings, and since flat varyings can contain integers and
+ * doubles, we can never interpolate them as FP32 or FP16. Optimizations start
+ * with separate interpolated, flat, and convergent groups of varyings, and
+ * they choose whether they want to promote convergent to interpolated or
+ * flat, or whether to leave that decision to the end when the compaction
+ * happens.
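+ *
+ * For example (a sketch in GLSL-like pseudocode; "scale" is a hypothetical
+ * uniform), this output is convergent because no per-vertex value feeds it,
+ * so all vertices of any primitive store the same value:
+ * ```
+ * out_var = scale * 2.0 + 1.0;
+ * ```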
+ *
+ * TES patch inputs are always convergent because they are uniform within
+ * a primitive.
+ *
+ *
+ * Optimization steps
+ * ==================
+ *
+ * 1. Determine which varying slots can be optimized and how.
+ *
+ *    * When a varying is said to be "optimized" in the following text, it
+ *      means all optimizations are performed, such as removal, constant
+ *      propagation, and deduplication.
+ *    * All VARn, PATCHn, and FOGC varyings are always optimized and
+ *      compacted.
+ *    * PRIMITIVE_ID is treated as VARn in (GS, FS).
+ *    * TEXn are removed if they are dead (except TEXn inputs, which can't be
+ *      removed due to being affected by the coord replace state). TEXn also
+ *      can't be optimized or compacted due to being affected by the coord
+ *      replace state. TEXn not consumed by FS are treated as VARn.
+ *    * COLn and BFCn only propagate constants if they are between 0 and 1
+ *      because of the clamp vertex color state, and they are only
+ *      deduplicated and compacted among themselves because they are affected
+ *      by the flat shade, provoking vertex, two-side color selection, and
+ *      clamp vertex color states. COLn and BFCn not consumed by FS are
+ *      treated as VARn.
+ *    * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
+ *      removed, but they are demoted to sysval-only outputs by setting
+ *      the "no_varying" flag (i.e. they can be removed as varyings), so
+ *      drivers should look at the "no_varying" flag. If an output is not
+ *      a sysval output in a specific stage, it's treated as VARn (such as
+ *      POS in TCS).
+ *    * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
+ *
+ * 2. Remove unused inputs and outputs
+ *
+ *    * Outputs not used in the next shader are removed.
+ *    * Inputs not initialized by the previous shader are replaced with undef
+ *      except:
+ *      * LAYER and VIEWPORT are replaced with 0 in FS.
+ *      * TEXn.xy is untouched because the coord replace state can set it, and
+ *        TEXn.zw is replaced by (0, 1), which is equal to the coord replace
+ *        value.
+ *    * Output loads that have no output stores anywhere in the shader are
+ *      replaced with undef (for TCS, though it works with any shader).
+ *    * Output stores with transform feedback are preserved, but get
+ *      the "no_varying" flag, meaning they are not consumed by the next
+ *      shader stage. Later, transform-feedback-only varyings are compacted
+ *      (relocated) such that they are always last.
+ *    * TCS outputs that are read by TCS, but not used by TES get
+ *      the "no_varying" flag to indicate that they are only read by TCS and
+ *      not consumed by TES. Later, such TCS outputs are compacted (relocated)
+ *      such that they are always last to keep all outputs consumed by TES
+ *      consecutive without holes.
+ *
+ * 3. Constant, uniform, UBO load, and uniform expression propagation
+ *
+ *    * Define "uniform expressions" as ALU expressions only sourcing
+ *      constants, uniforms, and UBO loads.
+ *    * Constants, uniforms, UBO loads, and uniform expressions stored
+ *      in outputs are moved into the next shader, and the outputs are removed.
+ *    * The same propagation is done from output stores to output loads
+ *      (for TCS, though it works with any shader).
+ *    * If there are multiple stores to the same output, all such stores
+ *      should store the same constant, uniform, UBO load, or uniform
+ *      expression for the expression to be propagated. If an output has
+ *      multiple vertices, all vertices should store the same expression.
+ *    * nir->options has callbacks that drivers can set to estimate the cost
+ *      of uniform expressions and thus control the complexity of the
+ *      expressions that are propagated. This is to ensure that we don't
+ *      increase the GPU overhead measurably by moving code across pipeline
+ *      stages that amplify GPU work.
+ *    * Special cases:
+ *      * Constant COLn and BFCn are propagated only if the constants are
+ *        in the [0, 1] range because of the clamp vertex color state.
+ *        If both COLn and BFCn are written, they must write the same
+ *        constant. If BFCn is written but not COLn, the constant is
+ *        propagated from BFCn to COLn.
+ *      * TEXn.xy is untouched because of the coord replace state.
+ *        If TEXn.zw is (0, 1), only those constants are propagated because
+ *        they match the coord replace values.
+ *      * CLIP_DISTn, LAYER, and VIEWPORT are always propagated.
+ *    * Eliminated output stores get the "no_varying" flag if they are also
+ *      xfb stores or write sysval outputs.
+ *
+ * 4. Remove duplicated output components
+ *
+ *    * By comparing SSA defs.
+ *    * If there are multiple stores to the same output, all such stores
+ *      should store the same SSA as all stores of another output for
+ *      the output to be considered duplicated. If an output has multiple
+ *      vertices, all vertices should store the same SSA.
+ *    * Deduplication can only be done between outputs of the same category.
+ *      Those are: interpolated, patch, flat, interpolated color, flat color,
+ *      and conditionally interpolated color based on the flat shade state.
+ *    * Everything is deduplicated except TEXn due to the coord replace state.
+ *    * Eliminated output stores get the "no_varying" flag if they are also
+ *      xfb stores or write sysval outputs.
+ *
+ * 5. Backward inter-shader code motion
+ *
+ *    "Backward" refers to moving code in the opposite direction that shaders
+ *    are executed, i.e. moving code from the consumer to the producer.
+ *
+ *    Fragment shader example:
+ *    ```
+ *    result = input0 * uniform + input1 * constant + UBO.variable;
+ *    ```
+ *
+ *    The computation of "result" in the above example can be moved into
+ *    the previous shader and both inputs can be replaced with a new input
+ *    holding the value of "result", thus making the shader smaller and
+ *    possibly reducing the number of inputs, uniforms, and UBOs by 1.
+ *
+ *    Such code motion can be performed for any expression sourcing only
+ *    inputs, constants, and uniforms except for fragment shaders, which can
+ *    also do it but with the following limitations:
+ *    * Only these transformations can be performed with interpolated inputs
+ *      and any composition of these transformations (such as lerp; see the
+ *      worked example after this list), which can all be proven
+ *      mathematically:
+ *      * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
+ *      * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
+ *      * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
+ *      * all of these transformations are considered "inexact" in NIR
+ *      * interp interpolates an input according to the barycentric
+ *        coordinates (i, j), which are different for perspective,
+ *        noperspective, center, centroid, sample, at_offset, and at_sample
+ *        modes.
+ *      * convergent_expr is any expression sourcing only constants,
+ *        uniforms, and convergent inputs. The only requirement on
+ *        convergent_expr is that it doesn't vary between vertices of
+ *        the same primitive, but it can vary between primitives.
+ *    * If inputs are flat or convergent, there are no limitations on
+ *      expressions that can be moved.
+ *    * Interpolated and flat inputs can't mix in the same expression, but
+ *      convergent inputs can mix with both.
+ *    * The interpolation qualifier of the new input is inherited from
+ *      the removed non-convergent inputs that should all have the same (i, j).
+ *      If there are no non-convergent inputs, then the new input is declared
+ *      as flat (for simplicity; we can't choose the barycentric coordinates
+ *      at random because AMD doesn't like it when there are multiple sets of
+ *      barycentric coordinates in the same shader unnecessarily).
+ *    * Inf values break code motion across interpolation. See the section
+ *      discussing how we handle it near the end.
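+ *
+ *    As a worked example of the rules above (a sketch; x and y are
+ *    interpolated inputs, c is a convergent expression, and the math is
+ *    "inexact"), a full lerp can be moved to the producer:
+ *    ```
+ *    interp(x, i, j) * c + interp(y, i, j) * (1 - c)
+ *       = interp(x * c, i, j) + interp(y * (1 - c), i, j)
+ *       = interp(x * c + y * (1 - c), i, j)
+ *    ```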
+ *
+ *    The above rules also apply to open-coded TES input interpolation, which
+ *    is handled the same as FS input interpolation. The only differences are:
+ *    * Open-coded TES input interpolation must match one of the allowed
+ *      equations. Different interpolation equations are treated the same as
+ *      different interpolation qualifiers in FS.
+ *    * Patch varyings are always treated as convergent.
+ *
+ *    Prerequisites:
+ *    * We need a post-dominator tree that is constructed from a graph where
+ *      vertices are instructions and directed edges going into them are
+ *      the values of their source operands. This is different from how NIR
+ *      dominance works, which represents all instructions within a basic
+ *      block as a linear chain of vertices in the graph.
+ *      In our graph, all loads without source operands and all constants are
+ *      entry nodes in the graph, and all stores and discards are exit nodes
+ *      in the graph. Each shader can have multiple disjoint graphs where
+ *      the Lowest Common Ancestor of 2 instructions doesn't exist.
+ *    * Given the above definition, the instruction whose result is the best
+ *      candidate for a new input is the farthest instruction that
+ *      post-dominates one or more inputs and is movable between shaders.
+ *
+ *    Algorithm Idea Part 1: Search
+ *    * Pick any input load that is hypothetically movable and call it
+ *      the iterator.
+ *    * Get the immediate post-dominator of the iterator, and if it's movable,
+ *      replace the iterator with it.
+ *    * Repeat the previous step until the obtained immediate post-dominator
+ *      is not movable.
+ *    * The iterator now contains the farthest post-dominator that is movable.
+ *    * Gather all input loads that the post-dominator consumes.
+ *    * For each of those input loads, all matching output stores must be
+ *      in the same block (because they will be replaced by a single store).
+ *
+ *    Algorithm Idea Part 2: Code Motion
+ *    * Clone the post-dominator in the producer except input loads, which
+ *      should be replaced by stored output values. Uniform and UBO loads,
+ *      if any, should be cloned too.
+ *    * Remove the original output stores.
+ *    * Replace the post-dominator from the consumer with a new input load.
+ *    * The step above makes the post-dominated input load that we picked
+ *      at the beginning dead, but other input loads used by the post-
+ *      dominator might still have other uses (shown in the example below).
+ *
+ *    Example SSA-use graph - initial shader and the result:
+ *    ```
+ *       input0 input1            input0 input1
+ *          \   /  \                 |      \
+ *    constant  alu  ...   ======>   |      ...
+ * \ / + * alu + * (post-dominator) + * ``` + * + * Description: + * On the right, the algorithm moved the constant and both ALU opcodes + * into the previous shader and input0 now contains the value of + * the post-dominator. input1 stays the same because it still has one + * use left. If input1 hadn't had the other use, it would have been + * removed. + * + * If the algorithm moves any code, the algorithm is repeated until there + * is no code that it can move. + * + * Which shader pairs are supported: + * * (VS, FS), (TES, FS): yes, fully + * * Limitation: If Infs must be preserved, no code is moved across + * interpolation, so only flat varyings are optimized. + * * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO + * * Current behavior: + * * Per-vertex inputs are rejected. + * * Possible solution: + * * All input loads used by an accepted post-dominator must use + * the same vertex index. The post-dominator must use all loads with + * that vertex index. + * * If a post-dominator is found for an input load from a specific + * slot, all other input loads from that slot must also have + * an accepted post-dominator, and all such post-dominators should + * be identical expressions. + * * (TCS, TES), (VS, TES): yes, with limitations + * * Limitations: + * * Only 1 store and 1 load per slot allowed. + * * No output loads allowed. + * * All stores used by an accepted post-dominator must be in + * the same block. + * * TCS barriers don't matter because there are no output loads. + * * Patch varyings are handled trivially with the above constraints. + * * Per-vertex outputs should only be indexed by gl_InvocationID. + * * An interpolated TES load is any ALU instruction that computes + * the result of linear interpolation of per-vertex inputs from + * the same slot using gl_TessCoord. If such an ALU instruction is + * found, it must be the only one, and all per-vertex input loads + * from that slot must feed into it. The interpolation equation must + * be equal to one of the allowed equations. Then the same rules as + * for interpolated FS inputs are used, treating different + * interpolation equations just like different interpolation + * qualifiers. + * * Patch inputs are treated as convergent, which means they are + * allowed to be in the same movable expression as interpolated TES + * inputs, and the same rules as for convergent FS inputs apply. + * * (GS, FS), (MS, FS): no + * * Workaround: Add a passthrough VS between GS/MS and FS, run + * the pass on the (VS, FS) pair to move code out of FS, + * and inline that VS at the end of your hw-specific + * GS/MS if it's possible. + * * (TS, MS): no + * + * The disadvantage of using the post-dominator tree is that it's a tree, + * which means there is only 1 post-dominator of each input. This example + * shows a case that could be optimized by replacing 3 inputs with 2 inputs, + * reducing the number of inputs by 1, but the immediate post-dominator of + * all input loads is NULL: + * ``` + * temp0 = input0 + input1 + input2; + * temp1 = input0 + input1 * const1 + input2 * const2; + * ``` + * + * If there is a graph algorithm that returns the best solution to + * the above case (which is temp0 and temp1 to replace all 3 inputs), let + * us know. + * + * 6. Forward inter-shader code motion + * + * TODO: Not implemented. The text below is a draft of the description. + * + * "Forward" refers to moving code in the direction that shaders are + * executed, i.e. moving code from the producer to the consumer. 
+ *
+ *    Vertex shader example:
+ *    ```
+ *    output0 = value + 1;
+ *    output1 = value * 2;
+ *    ```
+ *
+ *    Both outputs can be replaced by 1 output storing "value", and both ALU
+ *    operations can be moved into the next shader.
+ *
+ *    The same dominance algorithm as in the previous optimization is used,
+ *    except that:
+ *    * Instead of inputs, we use outputs.
+ *    * Instead of a post-dominator tree, we use a dominator tree of the exact
+ *      same graph.
+ *
+ *    The algorithm idea is: For each pair of output stores, find their
+ *    Lowest Common Ancestor in the dominator tree, and that's a candidate
+ *    for a new output. All movable loads like load_const should be removed
+ *    from the graph, otherwise the LCA wouldn't exist.
+ *
+ *    The limitations on instructions that can be moved between shaders across
+ *    interpolated loads are exactly the same as the previous optimization.
+ *
+ *    nir->options has callbacks that drivers can set to estimate the cost of
+ *    expressions and thus control the complexity of the expressions that can
+ *    be moved to later shaders. This is to ensure that we don't increase
+ *    the GPU overhead measurably by moving code across pipeline stages that
+ *    amplify GPU work.
+ *
+ * 7. Compaction to vec4 slots (AKA packing)
+ *
+ *    First, varyings are divided into these groups, and each group is
+ *    compacted separately with some exceptions listed below:
+ *
+ *    Non-FS groups (patch and non-patch are packed separately):
+ *    * 32-bit flat
+ *    * 16-bit flat
+ *    * 32-bit no-varying (TCS outputs read by TCS but not TES)
+ *    * 16-bit no-varying (TCS outputs read by TCS but not TES)
+ *
+ *    FS groups:
+ *    * 32-bit interpolated (always FP32)
+ *    * 32-bit flat
+ *    * 32-bit convergent (always FP32)
+ *    * 16-bit interpolated (always FP16)
+ *    * 16-bit flat
+ *    * 16-bit convergent (always FP16)
+ *    * 32-bit transform feedback only
+ *    * 16-bit transform feedback only
+ *
+ *    Then, all scalar varyings are relocated into new slots, starting from
+ *    VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit
+ *    increments. Rules:
+ *    * Both 32-bit and 16-bit flat varyings are packed in the same vec4.
+ *    * Convergent varyings can be packed with interpolated varyings of
+ *      the same type or flat. The group to pack with is chosen based on
+ *      whichever has unused scalar slots because we want to reduce the total
+ *      number of vec4s. After filling all unused scalar slots, the remaining
+ *      convergent varyings are packed as flat.
+ *    * Transform-feedback-only slots and no-varying slots are packed last,
+ *      so that they are consecutive and not intermixed with varyings consumed
+ *      by the next shader stage, and 32-bit and 16-bit slots are packed in
+ *      the same vec4. This allows reducing memory for outputs by ignoring
+ *      the trailing outputs that the next shader stage doesn't read.
+ *
+ *    In the end, we should end up with these groups for FS:
+ *    * 32-bit interpolated (always FP32) on separate vec4s
+ *    * 16-bit interpolated (always FP16) on separate vec4s
+ *    * 32-bit flat and 16-bit flat, mixed in the same vec4
+ *    * 32-bit and 16-bit transform feedback only, sharing vec4s with flat
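+ *
+ *    For example (a sketch of one possible result; the slot assignments are
+ *    illustrative, not guaranteed), a shader with 3 interpolated FP32
+ *    scalars, 1 convergent FP32 scalar, and 2 flat 16-bit scalars could be
+ *    compacted as:
+ *    ```
+ *    VAR0.x = interp fp32       VAR0.y = interp fp32
+ *    VAR0.z = interp fp32       VAR0.w = convergent fp32 (fills the hole)
+ *    VAR1.x = flat f16 (low half) + flat f16 (high half)
+ *    ```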
+ *
+ *    Colors are compacted the same way but separately because they can't be
+ *    mixed with VARn. Colors are divided into 3 FS groups. They are:
+ *    * 32-bit maybe-interpolated (affected by the flat-shade state)
+ *    * 32-bit interpolated (not affected by the flat-shade state)
+ *    * 32-bit flat (not affected by the flat-shade state)
+ *
+ *    To facilitate driver-specific output merging, color channels are
+ *    assigned in a rotated order depending on which channel is the first
+ *    unused VARn channel. For example, if the first unused VARn channel is
+ *    VAR0.z, color channels are allocated in this order:
+ *    COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
+ *    The reason is that some drivers merge outputs if each output sets
+ *    different components, for example 2 outputs defining VAR0.xy and COL0.z.
+ *    If drivers do interpolation in the fragment shader and color
+ *    interpolation can differ for each component, VAR0.xy and COL0.z can be
+ *    stored in the same output storage slot, and the consumer can load VAR0
+ *    and COL0 from the same slot.
+ *
+ *    If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to
+ *    VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to
+ *    VARn for better packing.
+ *
+ *
+ * Issue: Interpolation converts Infs to NaNs
+ * ==========================================
+ *
+ * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which
+ * impacts and limits backward inter-shader code motion, uniform expression
+ * propagation, and compaction.
+ *
+ * When we decide not to interpolate a varying, we need to convert Infs to
+ * NaNs manually. Infs can be converted to NaNs like this: x*0 + x
+ * (suggested by Ian Romanick; the multiplication must be "exact").
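+ *
+ * Why "x*0 + x" works (a sketch of the case analysis; it relies on the
+ * multiplication not being optimized out):
+ * ```
+ * x finite:  x*0 + x = 0 + x   = x     (value preserved)
+ * x = Inf:   x*0 + x = NaN + x = NaN   (Inf*0 = NaN, so Inf becomes NaN)
+ * x = NaN:   x*0 + x = NaN + x = NaN   (NaN preserved)
+ * ```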
+ *
+ * Changes to optimizations:
+ * - When we propagate a uniform expression and NaNs must be preserved,
+ *   convert Infs in the result to NaNs using "x*0 + x" in the consumer.
+ * - When we change interpolation to flat for convergent varyings and NaNs
+ *   must be preserved, apply "x*0 + x" to the stored output value
+ *   in the producer.
+ * - There is no solution for backward inter-shader code motion with
+ *   interpolation if Infs must be preserved. As an alternative, we can allow
+ *   code motion across interpolation only for specific shader hashes in
+ *   can_move_alu_across_interp. We can use shader-db to automatically produce
+ *   a list of shader hashes that benefit from this optimization.
+ *
+ *
+ * Usage
+ * =====
+ *
+ * Requirements:
+ * - ALUs should be scalarized
+ * - Dot products and other vector opcodes should be lowered (recommended)
+ * - Input loads and output stores should be scalarized
+ * - 64-bit varyings should be lowered to 32 bits
+ * - nir_vertex_divergence_analysis must be called on the producer if
+ *   the consumer is a fragment shader
+ *
+ * It's recommended to first run this on all shader pairs from the first
+ * shader to the last (to propagate constants etc.). If the optimization
+ * of (S1, S2) stages leads to changes in S1, remember the highest S1. Then
+ * re-run this for all shader pairs in the descending order from S1 to VS.
+ *
+ * NIR optimizations should be performed after every run that changes the IR.
+ *
+ *
+ * Analyzing the optimization potential of linking separate shaders
+ * ================================================================
+ *
+ * We can use this pass in an analysis pass that decides whether a separate
+ * shader has the potential to benefit from full draw-time linking. The way
+ * it would work is that we would create a passthrough shader adjacent to
+ * the separate shader, run this pass on both shaders, and check if the number
+ * of varyings decreased. This way we can decide to perform the draw-time
+ * linking only if we are confident that it would help performance.
+ *
+ * TODO: not implemented, mention the pass that implements it
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "util/u_math.h"
+
+/* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
+ *
+ * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
+ * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels
+ * don't set these slots as used in bitmasks.
+ */
+#define NUM_SCALAR_SLOTS (NUM_TOTAL_VARYING_SLOTS * 8)
+
+/* Fragment shader input slots can be packed with indirectly-indexed vec4
+ * slots if there are unused components, but only if the vec4 slot has
+ * the same interpolation type. There are 4 types: FLAT, FP32, FP16, and
+ * COLOR.
+ */
+enum fs_vec4_type {
+   FS_VEC4_TYPE_NONE = 0,
+   FS_VEC4_TYPE_FLAT,
+   FS_VEC4_TYPE_INTERP_FP32,
+   FS_VEC4_TYPE_INTERP_FP16,
+   FS_VEC4_TYPE_INTERP_COLOR,
+};
+
+static unsigned
+get_scalar_16bit_slot(nir_io_semantics sem, unsigned component)
+{
+   return sem.location * 8 + component * 2 + sem.high_16bits;
+}
+
+static unsigned
+intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr)
+{
+   return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr),
+                                nir_intrinsic_component(intr));
+}
+
+static unsigned
+vec4_slot(unsigned scalar_slot)
+{
+   return scalar_slot / 8;
+}
+
+struct list_node {
+   struct list_head head;
+   nir_intrinsic_instr *instr;
+};
+
+/* Information about 1 scalar varying slot for both shader stages. */
+struct scalar_slot {
+   struct {
+      /* Linked list of all store instructions writing into the scalar slot
+       * in the producer.
+       */
+      struct list_head stores;
+
+      /* Only for TCS: Linked list of all load instructions that read the
+       * scalar slot in the producer.
+       */
+      struct list_head loads;
+
+      /* If there is only one store instruction or if all store instructions
+       * store the same value in the producer, this is the instruction
+       * computing the stored value. Used by constant and uniform propagation
+       * to the next shader.
+       */
+      nir_instr *value;
+   } producer;
+
+   struct {
+      /* Linked list of all load instructions loading from the scalar slot
+       * in the consumer.
+       */
+      struct list_head loads;
+
+      /* The result of TES input interpolation. */
+      nir_alu_instr *tes_interp_load;
+      unsigned tes_interp_mode; /* FLAG_INTERP_TES_* */
+      nir_def *tes_load_tess_coord;
+   } consumer;
+
+   /* The number of accessed slots if this slot has indirect indexing. */
+   unsigned num_slots;
+};
+
+struct linkage_info {
+   struct scalar_slot slot[NUM_SCALAR_SLOTS];
+
+   bool spirv;
+   bool can_move_uniforms;
+   bool can_move_ubos;
+
+   gl_shader_stage producer_stage;
+   gl_shader_stage consumer_stage;
+   nir_builder producer_builder;
+   nir_builder consumer_builder;
+   unsigned max_varying_expression_cost;
+
+   /* Memory context for linear_alloc_child (fast allocation). */
+   void *linear_mem_ctx;
+
+   /* If any component of a vec4 slot is accessed indirectly, this is its
+    * FS vec4 qualifier type, which is FLAT, FP32, FP16, or COLOR.
+    * Components with different qualifier types can't be compacted
+    * in the same vec4.
+    */
+   uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS];
+
+   /* Mask of all varyings that can be removed. Only a few non-VARn
+    * non-PATCHn varyings can't be removed.
+    */
+   BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of all slots that have transform feedback info. */
+   BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of all slots that have transform feedback info, but are not used
+    * by the next shader. Separate masks for 32-bit and 16-bit outputs.
+    */
+   BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of all TCS->TES slots that are read by TCS, but not TES. */
+   BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of all slots accessed with indirect indexing. */
+   BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS);
+
+   /* The following masks only contain slots that can be compacted and
+    * describe the groups in which they should be compacted. Non-fragment
+    * shaders only use the flat bitmasks.
+    *
+    * Some legacy varyings are excluded when they can't be compacted due to
+    * being affected by pipeline states (like coord replace). That only
+    * applies to xx->FS shader pairs. Other shader pairs get all legacy
+    * varyings compacted and relocated to VARn.
+    *
+    * Indirectly-indexed varyings are also excluded because they are not
+    * compacted.
+    */
+   BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS);
+
+   /* Color interpolation unqualified (follows the flat-shade state). */
+   BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of output components that have only one store instruction, or if
+    * they have multiple store instructions, all those instructions store
+    * the same value. If the output has multiple vertices, all vertices store
+    * the same value. This is a useful property for:
+    * - constant and uniform propagation to the next shader
+    * - deduplicating outputs
+    */
+   BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
+
+   /* Mask of output components that store values that are convergent,
+    * i.e. all values stored into the outputs are equal within a primitive.
+    *
+    * This is different from output_equal_mask, which says that all stores
+    * to the same slot in the same thread are equal, while this says that
+    * each store to the same slot can be different, but it always stores
+    * a convergent value, which means the stored value is equal among all
+    * threads within a primitive.
+    *
+    * The advantage is that these varyings can always be promoted to flat
+    * regardless of the original interpolation mode, and they can always be
+    * compacted with both interpolated and flat varyings.
+    */
+   BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
+   BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
+};
+
+/******************************************************************
+ * HELPERS
+ ******************************************************************/
+
+/* Return whether the low or high 16-bit slot is 1. */
+#define BITSET_TEST32(m, b) \
+   (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1))
+
+static void
+print_linkage(struct linkage_info *linkage)
+{
+   printf("Linkage: %s -> %s\n",
+          _mesa_shader_stage_to_abbrev(linkage->producer_stage),
+          _mesa_shader_stage_to_abbrev(linkage->consumer_stage));
+
+   for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) {
+      struct scalar_slot *slot = &linkage->slot[i];
+
+      if (!slot->num_slots &&
+          list_is_empty(&slot->producer.stores) &&
+          list_is_empty(&slot->producer.loads) &&
+          list_is_empty(&slot->consumer.loads) &&
+          !BITSET_TEST(linkage->removable_mask, i) &&
+          !BITSET_TEST(linkage->indirect_mask, i) &&
+          !BITSET_TEST(linkage->xfb32_only_mask, i) &&
+          !BITSET_TEST(linkage->xfb16_only_mask, i) &&
+          !BITSET_TEST(linkage->no_varying32_mask, i) &&
+          !BITSET_TEST(linkage->no_varying16_mask, i) &&
+          !BITSET_TEST(linkage->interp_fp32_mask, i) &&
+          !BITSET_TEST(linkage->interp_fp16_mask, i) &&
+          !BITSET_TEST(linkage->flat32_mask, i) &&
+          !BITSET_TEST(linkage->flat16_mask, i) &&
+          !BITSET_TEST(linkage->convergent32_mask, i) &&
+          !BITSET_TEST(linkage->convergent16_mask, i) &&
+          !BITSET_TEST(linkage->output_equal_mask, i))
+         continue;
+
+      printf("  %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+             gl_varying_slot_name_for_stage(vec4_slot(i),
+                                            linkage->producer_stage) + 13,
+             "xyzw"[(i / 2) % 4],
+             i % 2 ? "hi" : "lo",
+             slot->num_slots,
+             BITSET_TEST(linkage->removable_mask, i) ? " removable" : "",
+             BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
+             BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
+             BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
+             BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
+             BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
+             BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
+             BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "",
+             BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "",
+             BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "",
+             BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "",
+             BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "",
+             BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "",
+             !list_is_empty(&slot->producer.stores) ? " producer_stores" : "",
+             !list_is_empty(&slot->producer.loads) ? " producer_loads" : "",
+             !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : "");
+   }
+}
+
+static void
+slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
+                                          unsigned i)
+{
+   BITSET_CLEAR(linkage->output_equal_mask, i);
+   BITSET_CLEAR(linkage->convergent32_mask, i);
+   BITSET_CLEAR(linkage->convergent16_mask, i);
+   BITSET_CLEAR(linkage->interp_fp32_mask, i);
+   BITSET_CLEAR(linkage->interp_fp16_mask, i);
+   BITSET_CLEAR(linkage->flat32_mask, i);
+   BITSET_CLEAR(linkage->flat16_mask, i);
+   BITSET_CLEAR(linkage->no_varying32_mask, i);
+   BITSET_CLEAR(linkage->no_varying16_mask, i);
+   BITSET_CLEAR(linkage->color32_mask, i);
+}
+
+static void
+clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i,
+                              bool uses_xfb)
+{
+   slot_disable_optimizations_and_compaction(linkage, i);
+
+   if (uses_xfb)
+      return;
+
+   linkage->slot[i].num_slots = 0;
+
+   BITSET_CLEAR(linkage->indirect_mask, i);
+   BITSET_CLEAR(linkage->removable_mask, i);
+
+   /* Transform feedback stores can't be removed. */
+   assert(!BITSET_TEST(linkage->xfb32_only_mask, i));
+   assert(!BITSET_TEST(linkage->xfb16_only_mask, i));
+}
+
+static bool
+has_xfb(nir_intrinsic_instr *intr)
+{
+   /* This only indicates whether the intrinsic is ABLE to have xfb info. */
+   if (!nir_intrinsic_has_io_xfb(intr))
+      return false;
+
+   unsigned comp = nir_intrinsic_component(intr);
+
+   if (comp >= 2)
+      return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0;
+   else
+      return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0;
+}
+
+static bool
+is_interpolated_color(struct linkage_info *linkage, unsigned i)
+{
+   if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
+      return false;
+
+   /* BFCn stores are bunched in the COLn slots with COLn, so we should never
+    * get BFCn here.
+    */
+   assert(vec4_slot(i) != VARYING_SLOT_BFC0 &&
+          vec4_slot(i) != VARYING_SLOT_BFC1);
+
+   return vec4_slot(i) == VARYING_SLOT_COL0 ||
+          vec4_slot(i) == VARYING_SLOT_COL1;
+}
+
+static bool
+is_interpolated_texcoord(struct linkage_info *linkage, unsigned i)
+{
+   if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
+      return false;
+
+   return vec4_slot(i) >= VARYING_SLOT_TEX0 &&
+          vec4_slot(i) <= VARYING_SLOT_TEX7;
+}
+
+static bool
+color_uses_shade_model(struct linkage_info *linkage, unsigned i)
+{
+   if (!is_interpolated_color(linkage, i))
+      return false;
+
+   list_for_each_entry(struct list_node, iter,
+                       &linkage->slot[i].consumer.loads, head) {
+      assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input);
+
+      nir_intrinsic_instr *baryc =
+         nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
+      if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE)
+         return true;
+   }
+
+   return false;
+}
+
+static bool
+preserve_infs_nans(nir_shader *nir, unsigned bit_size)
+{
+   unsigned mode = nir->info.float_controls_execution_mode;
+
+   return nir_is_float_control_inf_preserve(mode, bit_size) ||
+          nir_is_float_control_nan_preserve(mode, bit_size);
+}
+
+static bool
+preserve_nans(nir_shader *nir, unsigned bit_size)
+{
+   unsigned mode = nir->info.float_controls_execution_mode;
+
+   return nir_is_float_control_nan_preserve(mode, bit_size);
+}
+
+static nir_def *
+build_convert_inf_to_nan(nir_builder *b, nir_def *x)
+{
+   /* Do x*0 + x. The multiplication by 0 can't be optimized out. */
+   nir_def *fma = nir_ffma_imm1(b, x, 0, x);
+   nir_instr_as_alu(fma->parent_instr)->exact = true;
+   return fma;
+}
+
+/******************************************************************
+ * GATHERING INPUTS & OUTPUTS
+ ******************************************************************/
+
+static bool
+is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
+                        nir_intrinsic_instr *intr)
+{
+   return nir_slot_is_sysval_output(vec4_slot(slot),
+                                    linkage->consumer_stage) &&
+          !nir_intrinsic_io_semantics(intr).no_sysval_output;
+}
+
+/**
+ * This function acts like a filter. The pass won't touch varyings that
+ * return false here, and the return value is saved in the linkage bitmasks,
+ * so that all subpasses will *automatically* skip such varyings.
+ */
+static bool
+can_remove_varying(struct linkage_info *linkage, gl_varying_slot location)
+{
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      /* User-defined varyings and fog coordinates can always be removed. */
+      if (location >= VARYING_SLOT_VAR0 ||
+          location == VARYING_SLOT_FOGC)
+         return true;
+
+      /* These can be removed as varyings, which means they will be demoted to
+       * sysval-only outputs keeping their culling/rasterization functions
+       * while not passing the values to FS.
+       * Drivers should handle the "no_varying" semantic to benefit from
+       * this.
+       *
+       * Note: When removing unset LAYER and VIEWPORT FS inputs, they will
+       * be replaced by 0 instead of undef.
+       */
+      if (location == VARYING_SLOT_CLIP_DIST0 ||
+          location == VARYING_SLOT_CLIP_DIST1 ||
+          location == VARYING_SLOT_CULL_DIST0 ||
+          location == VARYING_SLOT_CULL_DIST1 ||
+          location == VARYING_SLOT_LAYER ||
+          location == VARYING_SLOT_VIEWPORT)
+         return true;
+
+      /* COLn inputs can be removed only if both COLn and BFCn are not
+       * written. Both COLn and BFCn outputs can be removed if COLn inputs
+       * aren't read.
+       *
+       * TEXn inputs can never be removed in FS because of the coord replace
+       * state, but TEXn outputs can be removed if they are not read by FS.
+       */
+      if (location == VARYING_SLOT_COL0 ||
+          location == VARYING_SLOT_COL1 ||
+          location == VARYING_SLOT_BFC0 ||
+          location == VARYING_SLOT_BFC1 ||
+          (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7))
+         return true;
+
+      /* "GS -> FS" can remove the primitive ID if not written or not read. */
+      if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
+           linkage->producer_stage == MESA_SHADER_MESH) &&
+          location == VARYING_SLOT_PRIMITIVE_ID)
+         return true;
+
+      /* No other varyings can be removed. */
+      return false;
+   } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
+      /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values
+       * come from glPatchParameterfv.
+       *
+       * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which
+       * means they will be demoted to sysval-only outputs, so that drivers
+       * know that TES doesn't read them.
+       */
+      if (linkage->producer_stage == MESA_SHADER_VERTEX &&
+          (location == VARYING_SLOT_TESS_LEVEL_INNER ||
+           location == VARYING_SLOT_TESS_LEVEL_OUTER))
+         return false;
+
+      return true;
+   }
+
+   /* All other varyings can be removed. */
+   return true;
+}
+
+struct opt_options {
+   bool propagate_uniform_expr:1;
+   bool deduplicate:1;
+   bool inter_shader_code_motion:1;
+   bool compact:1;
+   bool disable_all:1;
+};
+
+/**
+ * Return which optimizations are allowed.
+ */
+static struct opt_options
+can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location)
+{
+   struct opt_options options_var = {
+      .propagate_uniform_expr = true,
+      .deduplicate = true,
+      .inter_shader_code_motion = true,
+      .compact = true,
+   };
+   struct opt_options options_color = {
+      .propagate_uniform_expr = true, /* only constants in [0, 1] */
+      .deduplicate = true,
+      .compact = true,
+   };
+   struct opt_options options_tex = {
+      .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */
+   };
+   struct opt_options options_sysval_output = {
+      .propagate_uniform_expr = true,
+      .deduplicate = true,
+   };
+   struct opt_options options_tess_levels = {
+      .propagate_uniform_expr = true,
+      .deduplicate = true,
+   };
+   struct opt_options options_disable_all = {
+      .disable_all = true,
+   };
+
+   assert(can_remove_varying(linkage, location));
+
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      /* xx -> FS */
+      /* User-defined varyings and fog coordinates can always be optimized. */
+      if (location >= VARYING_SLOT_VAR0 ||
+          location == VARYING_SLOT_FOGC)
+         return options_var;
+
+      /* The primitive ID can always be optimized in GS -> FS. */
+      if (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
+          location == VARYING_SLOT_PRIMITIVE_ID)
+         return options_var;
+
+      /* Colors can only do constant propagation if COLn and BFCn store the
+       * same constant and the constant is between 0 and 1 (because clamp
+       * vertex color state is unknown). Uniform propagation isn't possible
+       * because of the clamping.
+       *
+       * Color components can only be deduplicated and compacted among
+       * themselves if they have the same interpolation qualifier, and can't
+       * be mixed with other varyings.
+       */
+      if (location == VARYING_SLOT_COL0 ||
+          location == VARYING_SLOT_COL1 ||
+          location == VARYING_SLOT_BFC0 ||
+          location == VARYING_SLOT_BFC1)
+         return options_color;
+
+      /* TEXn.zw can only be constant-propagated if the value is (0, 1)
+       * because it matches the coord replace values.
+       */
+      if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
+         return options_tex;
+
+      /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate
+       * uniform expressions and be compacted (moved to VARn while keeping
+       * the sysval outputs where they are).
+       */
+      if (location == VARYING_SLOT_LAYER ||
+          location == VARYING_SLOT_VIEWPORT ||
+          location == VARYING_SLOT_CLIP_DIST0 ||
+          location == VARYING_SLOT_CLIP_DIST1 ||
+          location == VARYING_SLOT_CULL_DIST0 ||
+          location == VARYING_SLOT_CULL_DIST1)
+         return options_sysval_output;
+
+      /* Everything else can't be read by the consumer, such as POS, PSIZ,
+       * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc.
+       */
+      return options_disable_all;
+   }
+
+   if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) {
+      /* TESS_LEVEL_* can only propagate uniform expressions.
+       * Compaction is disabled because AMD doesn't want the varying to be
+       * moved to PATCHn while keeping the sysval output where it is.
+       */
+      if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
+          location == VARYING_SLOT_TESS_LEVEL_OUTER)
+         return options_tess_levels;
+   }
+
+   /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES),
+    * (TES, GS), and (VS, GS) can compact and optimize all varyings.
+    */
+   return options_var;
+}
+
+static bool
+gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr,
+              void *cb_data)
+{
+   struct linkage_info *linkage = (struct linkage_info *)cb_data;
+
+   if (intr->intrinsic != nir_intrinsic_load_input &&
+       intr->intrinsic != nir_intrinsic_load_per_vertex_input &&
+       intr->intrinsic != nir_intrinsic_load_interpolated_input)
+      return false;
+
+   /* nir_lower_io_to_scalar is required before this */
+   assert(intr->def.num_components == 1);
+   /* Non-zero constant offsets should have been folded by
+    * nir_io_add_const_offset_to_base.
+    */
+   nir_src offset = *nir_get_io_offset_src(intr);
+   assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+   if (!can_remove_varying(linkage, sem.location))
+      return false;
+
+   /* Insert the load into the list of loads for this scalar slot. */
+   unsigned slot = intr_get_scalar_16bit_slot(intr);
+   struct scalar_slot *in = &linkage->slot[slot];
+   struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
+                                               sizeof(struct list_node));
+   node->instr = intr;
+   list_addtail(&node->head, &in->consumer.loads);
+   in->num_slots = MAX2(in->num_slots, sem.num_slots);
+
+   BITSET_SET(linkage->removable_mask, slot);
+
+   enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
+
+   /* Determine the type of the input for compaction.
+    * Other inputs can be compacted with indirectly-indexed vec4 slots
+    * if they have unused components, but only if they are of the same type.
+    */
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      if (intr->intrinsic == nir_intrinsic_load_input)
+         fs_vec4_type = FS_VEC4_TYPE_FLAT;
+      else if (color_uses_shade_model(linkage, slot))
+         fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR;
+      else if (intr->def.bit_size == 32)
+         fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32;
+      else if (intr->def.bit_size == 16)
+         fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16;
+      else
+         unreachable("invalid load_interpolated_input type");
+
+      linkage->fs_vec4_type[sem.location] = fs_vec4_type;
+   }
+
+   /* Indirect indexing. */
+   if (!nir_src_is_const(offset)) {
+      /* Only the indirectly-indexed component is marked as indirect. */
+      for (unsigned i = 0; i < sem.num_slots; i++)
+         BITSET_SET(linkage->indirect_mask, slot + i * 8);
+
+      /* Set the same vec4 type as the first element in all slots. */
+      if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+         for (unsigned i = 1; i < sem.num_slots; i++)
+            linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
+      }
+      return false;
+   }
+
+   if (!can_optimize_varying(linkage, sem.location).compact)
+      return false;
+
+   /* Record inputs that can be compacted. */
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      if (intr->intrinsic == nir_intrinsic_load_input) {
+         if (intr->def.bit_size == 32)
+            BITSET_SET(linkage->flat32_mask, slot);
+         else if (intr->def.bit_size == 16)
+            BITSET_SET(linkage->flat16_mask, slot);
+         else
+            unreachable("invalid load_input type");
+      } else {
+         if (color_uses_shade_model(linkage, slot))
+            BITSET_SET(linkage->color32_mask, slot);
+         else if (intr->def.bit_size == 32)
+            BITSET_SET(linkage->interp_fp32_mask, slot);
+         else if (intr->def.bit_size == 16)
+            BITSET_SET(linkage->interp_fp16_mask, slot);
+         else
+            unreachable("invalid load_interpolated_input type");
+      }
+   } else {
+      if (intr->def.bit_size == 32)
+         BITSET_SET(linkage->flat32_mask, slot);
+      else if (intr->def.bit_size == 16)
+         BITSET_SET(linkage->flat16_mask, slot);
+      else
+         unreachable("invalid load_input type");
+   }
+   return false;
+}
+
+static bool
+gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr,
+               void *cb_data)
+{
+   struct linkage_info *linkage = (struct linkage_info *)cb_data;
+
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_load_output &&
+       intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
+       intr->intrinsic != nir_intrinsic_load_per_vertex_output)
+      return false;
+
+   bool is_store =
+      intr->intrinsic == nir_intrinsic_store_output ||
+      intr->intrinsic == nir_intrinsic_store_per_vertex_output;
+
+   if (is_store) {
+      /* nir_lower_io_to_scalar is required before this */
+      assert(intr->src[0].ssa->num_components == 1);
+      /* nir_opt_undef is required before this. */
+      assert(intr->src[0].ssa->parent_instr->type !=
+             nir_instr_type_undef);
+   } else {
+      /* nir_lower_io_to_scalar is required before this */
+      assert(intr->def.num_components == 1);
+      /* Output loads are only allowed in TCS. */
+      assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
+   }
+
+   /* Non-zero constant offsets should have been folded by
+    * nir_io_add_const_offset_to_base.
+    */
+   nir_src offset = *nir_get_io_offset_src(intr);
+   assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+   if (!can_remove_varying(linkage, sem.location))
+      return false;
+
+   /* For "xx -> FS", treat BFCn stores as COLn to make dead varying
+    * elimination do the right thing automatically. The rules are:
+    * - COLn inputs can be removed only if both COLn and BFCn are not
+    *   written.
+    * - Both COLn and BFCn outputs can be removed if COLn inputs
+    *   aren't read.
+    */
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      if (sem.location == VARYING_SLOT_BFC0)
+         sem.location = VARYING_SLOT_COL0;
+      else if (sem.location == VARYING_SLOT_BFC1)
+         sem.location = VARYING_SLOT_COL1;
+   }
+
+   /* Insert the instruction into the list of stores or loads for this
+    * scalar slot.
+    */
+   unsigned slot =
+      get_scalar_16bit_slot(sem, nir_intrinsic_component(intr));
+
+   struct scalar_slot *out = &linkage->slot[slot];
+   struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
+                                               sizeof(struct list_node));
+   node->instr = intr;
+   out->num_slots = MAX2(out->num_slots, sem.num_slots);
+
+   if (is_store) {
+      list_addtail(&node->head, &out->producer.stores);
+
+      if (has_xfb(intr)) {
+         BITSET_SET(linkage->xfb_mask, slot);
+
+         if (sem.no_varying &&
+             !is_active_sysval_output(linkage, slot, intr)) {
+            if (intr->src[0].ssa->bit_size == 32)
+               BITSET_SET(linkage->xfb32_only_mask, slot);
+            else if (intr->src[0].ssa->bit_size == 16)
+               BITSET_SET(linkage->xfb16_only_mask, slot);
+            else
+               unreachable("invalid store_output type");
+         }
+      }
+   } else {
+      list_addtail(&node->head, &out->producer.loads);
+   }
+
+   BITSET_SET(linkage->removable_mask, slot);
+
+   /* Indirect indexing. */
+   if (!nir_src_is_const(offset)) {
+      /* Only the indirectly-indexed component is marked as indirect. */
+      for (unsigned i = 0; i < sem.num_slots; i++)
+         BITSET_SET(linkage->indirect_mask, slot + i * 8);
+
+      /* Set the same vec4 type as the first element in all slots. */
+      if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+         enum fs_vec4_type fs_vec4_type =
+            linkage->fs_vec4_type[sem.location];
+
+         for (unsigned i = 1; i < sem.num_slots; i++)
+            linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
+      }
+      return false;
+   }
+
+   if (can_optimize_varying(linkage, sem.location).disable_all)
+      return false;
+
+   if (is_store) {
+      nir_def *value = intr->src[0].ssa;
+      /* If the store instruction is executed in a divergent block, the value
+       * that's stored in the output becomes divergent.
+       */
+      bool divergent = value->divergent ||
+                       intr->instr.block->divergent;
+
+      if (!out->producer.value) {
+         /* This is the first store to this output. */
+         BITSET_SET(linkage->output_equal_mask, slot);
+         out->producer.value = value->parent_instr;
+
+         /* Set whether the value is convergent. Such varyings can be
+          * promoted to flat regardless of their original interpolation
+          * mode.
+          */
+         if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) {
+            if (value->bit_size == 32)
+               BITSET_SET(linkage->convergent32_mask, slot);
+            else if (value->bit_size == 16)
+               BITSET_SET(linkage->convergent16_mask, slot);
+            else
+               unreachable("invalid store_output type");
+         }
+      } else {
+         /* There are multiple stores to the same output. If they store
+          * different values, clear the mask.
+          */
+         if (out->producer.value != value->parent_instr)
+            BITSET_CLEAR(linkage->output_equal_mask, slot);
+
+         /* Update divergence information. */
+         if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) {
+            if (value->bit_size == 32)
+               BITSET_CLEAR(linkage->convergent32_mask, slot);
+            else if (value->bit_size == 16)
+               BITSET_CLEAR(linkage->convergent16_mask, slot);
+            else
+               unreachable("invalid store_output type");
+         }
+      }
+   } else {
+      /* Only TCS output loads can get here.
+       *
+       * We need to record output loads as flat32 or flat16, otherwise
+       * compaction will think that the slot is free and will put some
+       * other output in its place.
+       */
+      assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
+
+      if (!can_optimize_varying(linkage, sem.location).compact)
+         return false;
+
+      if (intr->def.bit_size == 32)
+         BITSET_SET(linkage->flat32_mask, slot);
+      else if (intr->def.bit_size == 16)
+         BITSET_SET(linkage->flat16_mask, slot);
+      else
+         unreachable("invalid load_output type");
+   }
+   return false;
+}
+
+/******************************************************************
+ * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL)
+ ******************************************************************/
+
+static void
+tidy_up_indirect_varyings(struct linkage_info *linkage)
+{
+   unsigned i;
+
+   /* Indirectly-indexed slots can have direct access too and thus set
+    * various bitmasks, so clear those bitmasks to make sure they are not
+    * touched.
+    */
+   BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
+      slot_disable_optimizations_and_compaction(linkage, i);
+   }
+
+   /* If some slots have both direct and indirect accesses, move instructions
+    * of such slots to the slot representing the first array element, so that
+    * we can remove all loads/stores of dead indirectly-indexed varyings
+    * by only looking at the first element.
+    */
+   BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
+      struct scalar_slot *first = &linkage->slot[i];
+
+      /* Skip if this is not the first array element. The first element
+       * always sets num_slots to at least 2.
+       */
+      if (first->num_slots <= 1)
+         continue;
+
+      /* Move instructions from other elements of the indirectly-accessed
+       * array to the first element (by merging the linked lists).
+       */
+      for (unsigned elem = 1; elem < first->num_slots; elem++) {
+         /* The component slots are at 16-bit granularity, so we need to
+          * increment by 8 to get the same component in the next vec4 slot.
+          */
+         struct scalar_slot *other = &linkage->slot[i + elem * 8];
+
+         list_splicetail(&other->producer.stores, &first->producer.stores);
+         list_splicetail(&other->producer.loads, &first->producer.loads);
+         list_splicetail(&other->consumer.loads, &first->consumer.loads);
+         list_inithead(&other->producer.stores);
+         list_inithead(&other->producer.loads);
+         list_inithead(&other->consumer.loads);
+      }
+   }
+}
+
+/******************************************************************
+ * TIDYING UP CONVERGENT VARYINGS
+ ******************************************************************/
+
+/**
+ * Reorganize bitmasks for FS because they are initialized such that they can
+ * intersect with the convergent bitmasks. We want them to be disjoint, so
+ * that masks of interpolated, flat, and convergent varyings don't intersect.
+ */
+static void
+tidy_up_convergent_varyings(struct linkage_info *linkage)
+{
+   if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   unsigned i;
+   /* Whether to promote convergent interpolated slots to flat if it
+    * doesn't lead to worse compaction.
+    */
+ */ + bool optimize_convergent_slots = true; /* only turn off for debugging */ + + if (optimize_convergent_slots) { + /* If a slot is flat and convergent, keep the flat bit and remove + * the convergent bit. + * + * If a slot is interpolated and convergent, remove the interpolated + * bit and keep the convergent bit, which means that it's interpolated, + * but can be promoted to flat. + * + * Since the geometry shader is the only shader that can store values + * in multiple vertices before FS, it's required that all stores are + * equal to be considered convergent (output_equal_mask), otherwise + * the promotion to flat would be incorrect. + */ + BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) { + if (!BITSET_TEST(linkage->interp_fp32_mask, i) && + !BITSET_TEST(linkage->flat32_mask, i) && + !BITSET_TEST(linkage->color32_mask, i)) { + /* Compaction disallowed. */ + BITSET_CLEAR(linkage->convergent32_mask, i); + } else if (BITSET_TEST(linkage->flat32_mask, i) || + (linkage->producer_stage == MESA_SHADER_GEOMETRY && + !BITSET_TEST(linkage->output_equal_mask, i))) { + /* Keep the original qualifier. */ + BITSET_CLEAR(linkage->convergent32_mask, i); + } else { + /* Keep it convergent. */ + BITSET_CLEAR(linkage->interp_fp32_mask, i); + BITSET_CLEAR(linkage->color32_mask, i); + } + } + BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) { + if (!BITSET_TEST(linkage->interp_fp16_mask, i) && + !BITSET_TEST(linkage->flat16_mask, i)) { + /* Compaction disallowed. */ + BITSET_CLEAR(linkage->convergent16_mask, i); + } else if (BITSET_TEST(linkage->flat16_mask, i) || + (linkage->producer_stage == MESA_SHADER_GEOMETRY && + !BITSET_TEST(linkage->output_equal_mask, i))) { + /* Keep the original qualifier. */ + BITSET_CLEAR(linkage->convergent16_mask, i); + } else { + /* Keep it convergent. */ + BITSET_CLEAR(linkage->interp_fp16_mask, i); + } + } + } else { + /* Don't do anything with convergent slots. */ + BITSET_ZERO(linkage->convergent32_mask); + BITSET_ZERO(linkage->convergent16_mask); + } +} + +/****************************************************************** + * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS + ******************************************************************/ + +static bool +is_variable_present(nir_shader *nir, nir_variable *var, + nir_variable_mode mode, bool spirv) +{ + nir_foreach_variable_with_modes(it, nir, mode) { + if ((spirv && it->data.binding == var->data.binding) || + (!spirv && !strcmp(it->name, var->name))) + return true; + } + return false; +} + +/* TODO: this should be a helper in common code */ +static unsigned +get_uniform_components(const struct glsl_type *type) +{ + unsigned size = glsl_get_aoa_size(type); + size = MAX2(size, 1); + size *= glsl_get_matrix_columns(glsl_without_array(type)); + + if (glsl_type_is_dual_slot(glsl_without_array(type))) + size *= 2; + + /* Convert from vec4 to scalar. */ + return size * 4; +} + +static unsigned +get_ubo_slots(const nir_variable *var) +{ + if (glsl_type_is_interface(glsl_without_array(var->type))) { + unsigned slots = glsl_get_aoa_size(var->type); + return MAX2(slots, 1); + } + + return 1; +} + +/** + * Count uniforms and see if the combined uniform component count is over + * the limit. If it is, don't move any uniforms. It's sufficient if drivers + * declare a very high limit. 
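+ * The sum below counts producer-only, consumer-only, and shared uniforms
+ * once each, i.e. the worst case where one stage ends up declaring all
+ * of them after code motion.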
+ */
+static void
+determine_uniform_movability(struct linkage_info *linkage,
+ unsigned max_uniform_components)
+{
+ nir_shader *producer = linkage->producer_builder.shader;
+ nir_shader *consumer = linkage->consumer_builder.shader;
+ unsigned num_producer_uniforms = 0;
+ unsigned num_consumer_uniforms = 0;
+ unsigned num_shared_uniforms = 0;
+
+ nir_foreach_variable_with_modes(var, producer, nir_var_uniform) {
+ if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv))
+ num_shared_uniforms += get_uniform_components(var->type);
+ else
+ num_producer_uniforms += get_uniform_components(var->type);
+ }
+
+ nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) {
+ if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv))
+ num_consumer_uniforms += get_uniform_components(var->type);
+ }
+
+ linkage->can_move_uniforms =
+ num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <=
+ max_uniform_components;
+}
+
+/**
+ * Count UBOs and see if the combined UBO count is over the limit. If it is,
+ * don't move any UBOs. It's sufficient if drivers declare a very high limit.
+ */
+static void
+determine_ubo_movability(struct linkage_info *linkage,
+ unsigned max_ubos_per_stage)
+{
+ nir_shader *producer = linkage->producer_builder.shader;
+ nir_shader *consumer = linkage->consumer_builder.shader;
+ unsigned num_producer_ubos = 0;
+ unsigned num_consumer_ubos = 0;
+ unsigned num_shared_ubos = 0;
+
+ nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) {
+ if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv))
+ num_shared_ubos += get_ubo_slots(var);
+ else
+ num_producer_ubos += get_ubo_slots(var);
+ }
+
+ nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) {
+ if (!is_variable_present(producer, var, nir_var_mem_ubo,
+ linkage->spirv))
+ num_consumer_ubos += get_ubo_slots(var);
+ }
+
+ linkage->can_move_ubos =
+ num_producer_ubos + num_consumer_ubos + num_shared_ubos <=
+ max_ubos_per_stage;
+}
+
+/******************************************************************
+ * DEAD VARYINGS REMOVAL
+ ******************************************************************/
+
+static void
+remove_all_stores(struct linkage_info *linkage, unsigned i,
+ bool *uses_xfb, nir_opt_varyings_progress *progress)
+{
+ struct scalar_slot *slot = &linkage->slot[i];
+
+ assert(!list_is_empty(&slot->producer.stores) &&
+ list_is_empty(&slot->producer.loads) &&
+ list_is_empty(&slot->consumer.loads));
+
+ /* Remove all stores. */
+ list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) {
+ if (nir_remove_varying(iter->instr, linkage->consumer_stage)) {
+ list_del(&iter->head);
+ *progress |= nir_progress_producer;
+ } else {
+ if (has_xfb(iter->instr)) {
+ *uses_xfb = true;
+
+ if (!is_active_sysval_output(linkage, i, iter->instr)) {
+ if (iter->instr->src[0].ssa->bit_size == 32)
+ BITSET_SET(linkage->xfb32_only_mask, i);
+ else if (iter->instr->src[0].ssa->bit_size == 16)
+ BITSET_SET(linkage->xfb16_only_mask, i);
+ else
+ unreachable("invalid store_output type");
+ }
+ }
+ }
+ }
+}
+
+static void
+remove_dead_varyings(struct linkage_info *linkage,
+ nir_opt_varyings_progress *progress)
+{
+ unsigned i;
+
+ /* Remove dead inputs and outputs. */
+ BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
+ struct scalar_slot *slot = &linkage->slot[i];
+
+ /* Only indirect access can have no loads and stores because we moved
+ * them to the first element in tidy_up_indirect_varyings(). 
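+ * Non-first elements of an indirectly-indexed array had their lists
+ * spliced away there, so they are legitimately empty here.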
+ */ + assert(!list_is_empty(&slot->producer.stores) || + !list_is_empty(&slot->producer.loads) || + !list_is_empty(&slot->consumer.loads) || + BITSET_TEST(linkage->indirect_mask, i)); + + /* Nothing to do if there are no loads and stores. */ + if (list_is_empty(&slot->producer.stores) && + list_is_empty(&slot->producer.loads) && + list_is_empty(&slot->consumer.loads)) + continue; + + /* If there are producer loads (e.g. TCS) but no consumer loads + * (e.g. TES), set the "no_varying" flag to indicate that the outputs + * are not consumed by the next shader stage (e.g. TES). + */ + if (!list_is_empty(&slot->producer.stores) && + !list_is_empty(&slot->producer.loads) && + list_is_empty(&slot->consumer.loads)) { + for (unsigned list_index = 0; list_index < 2; list_index++) { + struct list_head *list = list_index ? &slot->producer.stores : + &slot->producer.loads; + + list_for_each_entry(struct list_node, iter, list, head) { + nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr); + sem.no_varying = 1; + nir_intrinsic_set_io_semantics(iter->instr, sem); + } + } + + /* This tells the compaction to move these varyings to the end. */ + if (BITSET_TEST(linkage->flat32_mask, i)) { + assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT); + BITSET_CLEAR(linkage->flat32_mask, i); + BITSET_SET(linkage->no_varying32_mask, i); + } + if (BITSET_TEST(linkage->flat16_mask, i)) { + assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT); + BITSET_CLEAR(linkage->flat16_mask, i); + BITSET_SET(linkage->no_varying16_mask, i); + } + continue; + } + + /* The varyings aren't dead if both loads and stores are present. */ + if (!list_is_empty(&slot->producer.stores) && + (!list_is_empty(&slot->producer.loads) || + !list_is_empty(&slot->consumer.loads))) + continue; + + bool uses_xfb = false; + + if (list_is_empty(&slot->producer.stores)) { + /* There are no stores. */ + assert(!list_is_empty(&slot->producer.loads) || + !list_is_empty(&slot->consumer.loads)); + + /* TEXn.xy loads can't be removed in FS because of the coord + * replace state, but TEXn outputs can be removed if they are + * not read by FS. + * + * TEXn.zw loads can be eliminated and replaced by (0, 1), which + * is equal to the coord replace value. + */ + if (is_interpolated_texcoord(linkage, i)) { + assert(i % 2 == 0); /* high 16-bit slots disallowed */ + /* Keep TEXn.xy. */ + if (i % 8 < 4) + continue; + } + + /* Replace all loads with undef. Do that for both input loads + * in the consumer stage and output loads in the producer stage + * because we also want to eliminate TCS loads that have no + * corresponding TCS stores. + */ + for (unsigned list_index = 0; list_index < 2; list_index++) { + struct list_head *list = list_index ? &slot->producer.loads : + &slot->consumer.loads; + nir_builder *b = list_index ? &linkage->producer_builder : + &linkage->consumer_builder; + + list_for_each_entry(struct list_node, iter, list, head) { + nir_intrinsic_instr *loadi = iter->instr; + nir_def *replacement = NULL; + + b->cursor = nir_before_instr(&loadi->instr); + + /* LAYER and VIEWPORT FS inputs should be replaced by 0 + * instead of undef. 
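+ * (an unwritten LAYER or VIEWPORT is defined to read as 0 in FS,
+ * so the replacement must match that)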
+ */
+ gl_varying_slot location = (gl_varying_slot)(vec4_slot(i));
+
+ if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
+ (location == VARYING_SLOT_LAYER ||
+ location == VARYING_SLOT_VIEWPORT ||
+ /* TEXn.z is replaced by 0 (matching coord replace) */
+ (is_interpolated_texcoord(linkage, i) && i % 8 == 4)))
+ replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size);
+ else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
+ /* TEXn.w is replaced by 1 (matching coord replace) */
+ is_interpolated_texcoord(linkage, i) && i % 8 == 6)
+ replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size);
+ else
+ replacement = nir_undef(b, 1, loadi->def.bit_size);
+
+ nir_def_rewrite_uses(&loadi->def, replacement);
+ nir_instr_remove(&loadi->instr);
+
+ *progress |= list_index ? nir_progress_producer :
+ nir_progress_consumer;
+ }
+ }
+
+ /* Clear the lists. */
+ list_inithead(&slot->producer.loads);
+ list_inithead(&slot->consumer.loads);
+ } else {
+ /* There are no loads. */
+ remove_all_stores(linkage, i, &uses_xfb, progress);
+ }
+
+ /* Clear bitmasks associated with this varying slot or array. */
+ for (unsigned elem = 0; elem < slot->num_slots; elem++)
+ clear_slot_info_after_removal(linkage, i + elem, uses_xfb);
+ }
+}
+
+/******************************************************************
+ * SSA CLONING HELPERS
+ ******************************************************************/
+
+/* Pass flags for inter-shader code motion. Also used by helpers. */
+#define FLAG_ALU_IS_TES_INTERP_LOAD BITFIELD_BIT(0)
+#define FLAG_MOVABLE BITFIELD_BIT(1)
+#define FLAG_UNMOVABLE BITFIELD_BIT(2)
+#define FLAG_POST_DOMINATOR_PROCESSED BITFIELD_BIT(3)
+#define FLAG_GATHER_LOADS_VISITED BITFIELD_BIT(4)
+
+#define FLAG_INTERP_MASK BITFIELD_RANGE(5, 3)
+#define FLAG_INTERP_CONVERGENT (0 << 5)
+#define FLAG_INTERP_FLAT (1 << 5)
+/* FS-only interpolation modes. */
+#define FLAG_INTERP_PERSP_PIXEL (2 << 5)
+#define FLAG_INTERP_PERSP_CENTROID (3 << 5)
+#define FLAG_INTERP_PERSP_SAMPLE (4 << 5)
+#define FLAG_INTERP_LINEAR_PIXEL (5 << 5)
+#define FLAG_INTERP_LINEAR_CENTROID (6 << 5)
+#define FLAG_INTERP_LINEAR_SAMPLE (7 << 5)
+/* TES-only interpolation modes. (these were found in shaders) */
+#define FLAG_INTERP_TES_TRIANGLE_UVW (2 << 5) /* v0*u + v1*v + v2*w */
+#define FLAG_INTERP_TES_TRIANGLE_WUV (3 << 5) /* v0*w + v1*u + v2*v */
+/* TODO: Feel free to insert more TES interpolation equations here. */
+
+static bool
+can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr)
+{
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+ unsigned allowed_modes =
+ (linkage->can_move_uniforms ? nir_var_uniform : 0) |
+ (linkage->can_move_ubos ? nir_var_mem_ubo : 0);
+
+ if (!nir_deref_mode_is_one_of(deref, allowed_modes))
+ return false;
+
+ /* Indirectly-indexed uniforms and UBOs are not moved into later shaders
+ * due to performance concerns, and they are not moved into previous shaders
+ * because it's unimplemented (TODO).
+ */
+ if (nir_deref_instr_has_indirect(deref))
+ return false;
+
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
+ /* Subroutine uniforms are not moved. Even though moving them works
+ * correctly (subroutines have been inlined at this point), subroutine
+ * functions aren't moved, and the linker doesn't like it when a shader
+ * contains a subroutine uniform but no subroutine functions. This could
+ * be fixed in the linker, but for now, don't
+ * move subroutine uniforms. 
+ */ + if (var->name && strstr(var->name, "__subu_") == var->name) + return false; + + return true; +} + +static nir_intrinsic_instr * +find_per_vertex_load_for_tes_interp(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + + for (unsigned i = 0; i < num_srcs; i++) { + nir_instr *src = alu->src[i].src.ssa->parent_instr; + nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src); + + if (intr) + return intr; + } + return NULL; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + return intr->intrinsic == nir_intrinsic_load_per_vertex_input ? + intr : NULL; + } + + default: + unreachable("unexpected instruction type"); + } +} + +static nir_def * +get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr) +{ + nir_intrinsic_instr *intr; + + if (instr->type == nir_instr_type_intrinsic) { + intr = nir_instr_as_intrinsic(instr); + } else { + assert(instr->type == nir_instr_type_alu && + instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD); + intr = find_per_vertex_load_for_tes_interp(instr); + } + + unsigned slot_index = intr_get_scalar_16bit_slot(intr); + assert(list_is_singular(&linkage->slot[slot_index].producer.stores)); + + nir_def *stored_value = + list_first_entry(&linkage->slot[slot_index].producer.stores, + struct list_node, head)->instr->src[0].ssa; + assert(stored_value->num_components == 1); + return stored_value; +} + +/* Clone the SSA, which can be in a different shader. */ +static nir_def * +clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa) +{ + switch (ssa->parent_instr->type) { + case nir_instr_type_load_const: + return nir_build_imm(b, ssa->num_components, ssa->bit_size, + nir_instr_as_load_const(ssa->parent_instr)->value); + + case nir_instr_type_undef: + return nir_undef(b, ssa->num_components, ssa->bit_size); + + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr); + + if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) { + /* We are cloning an interpolated TES load in the producer for + * backward inter-shader code motion. + */ + assert(&linkage->producer_builder == b); + return get_stored_value_for_load(linkage, &alu->instr); + } + + nir_def *src[4] = {0}; + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + assert(num_srcs <= ARRAY_SIZE(src)); + + for (unsigned i = 0; i < num_srcs; i++) + src[i] = clone_ssa(linkage, b, alu->src[i].src.ssa); + + nir_def *clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]); + nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr); + + alu_clone->exact = alu->exact; + alu_clone->no_signed_wrap = alu->no_signed_wrap; + alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap; + alu_clone->def.num_components = alu->def.num_components; + alu_clone->def.bit_size = alu->def.bit_size; + + for (unsigned i = 0; i < num_srcs; i++) { + memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle, + NIR_MAX_VEC_COMPONENTS); + } + + return clone; + } + + case nir_instr_type_intrinsic: { + /* Clone load_deref of uniform or ubo. It's the only thing that can + * occur here. 
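+ * when propagating uniform expressions. The load_input cases below
+ * only occur when cloning into the producer for backward
+ * inter-shader code motion.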
+ */ + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: { + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + + assert(deref); + assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo)); + /* Indirect uniform indexing is disallowed here. */ + assert(!nir_deref_instr_has_indirect(deref)); + + /* Get the uniform from the original shader. */ + nir_variable *var = nir_deref_instr_get_variable(deref); + assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos); + + /* Declare the uniform in the target shader. If it's the same shader + * (in the case of replacing output loads with a uniform), this has + * no effect. + */ + var = nir_clone_uniform_variable(b->shader, var, linkage->spirv); + + /* Re-build the uniform deref load before the load. */ + nir_deref_instr *load_uniform_deref = + nir_clone_deref_instr(b, var, deref); + + return nir_load_deref(b, load_uniform_deref); + } + + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: { + /* We are cloning load_input in the producer for backward + * inter-shader code motion. Replace the input load with the stored + * output value. That way we can clone any expression using inputs + * from the consumer in the producer. + */ + assert(&linkage->producer_builder == b); + return get_stored_value_for_load(linkage, &intr->instr); + } + + default: + unreachable("unexpected intrinsic"); + } + } + + default: + unreachable("unexpected instruction type"); + } +} + +/****************************************************************** + * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS) + ******************************************************************/ + +static void +remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot, + nir_opt_varyings_progress *progress) +{ + bool uses_xfb = false; + remove_all_stores(linkage, slot, &uses_xfb, progress); + clear_slot_info_after_removal(linkage, slot, uses_xfb); +} + +struct is_uniform_expr_state { + struct linkage_info *linkage; + unsigned cost; +}; + +static bool +is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state); + +static bool +src_is_uniform_expression(nir_src *src, void *data) +{ + return is_uniform_expression(src->ssa->parent_instr, + (struct is_uniform_expr_state*)data); +} + +/** + * Return whether instr is a uniform expression that can be moved into + * the next shader. + */ +static bool +is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state) +{ + const nir_shader_compiler_options *options = + state->linkage->producer_builder.shader->options; + + switch (instr->type) { + case nir_instr_type_load_const: + case nir_instr_type_undef: + return true; + + case nir_instr_type_alu: + state->cost += options->varying_estimate_instr_cost ? + options->varying_estimate_instr_cost(instr) : 1; + return nir_foreach_src(instr, src_is_uniform_expression, state); + + case nir_instr_type_intrinsic: + if (nir_instr_as_intrinsic(instr)->intrinsic == + nir_intrinsic_load_deref) { + state->cost += options->varying_estimate_instr_cost ? 
+ options->varying_estimate_instr_cost(instr) : 1; + return nir_foreach_src(instr, src_is_uniform_expression, state); + } + return false; + + case nir_instr_type_deref: + return can_move_deref_between_shaders(state->linkage, instr); + + default: + return false; + } +} + +/** + * Propagate constants, uniforms, UBO loads, and uniform expressions + * in output components to inputs loads in the next shader and output + * loads in the current stage, and remove the output components. + * + * Uniform expressions are ALU expressions only sourcing constants, uniforms, + * and UBO loads. + */ +static void +propagate_uniform_expressions(struct linkage_info *linkage, + nir_opt_varyings_progress *progress) +{ + unsigned i; + + /* Clear pass_flags, which is used by clone_ssa. */ + nir_shader_clear_pass_flags(linkage->consumer_builder.shader); + + /* Find uniform expressions. If there are multiple stores, they should all + * store the same value. That's guaranteed by output_equal_mask. + */ + BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { + if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr) + continue; + + struct scalar_slot *slot = &linkage->slot[i]; + assert(!list_is_empty(&slot->producer.loads) || + !list_is_empty(&slot->consumer.loads)); + + struct is_uniform_expr_state state = { + .linkage = linkage, + .cost = 0, + }; + + if (!is_uniform_expression(slot->producer.value, &state)) + continue; + + if (state.cost > linkage->max_varying_expression_cost) + continue; + + /* Colors can be propagated only if they are constant between [0, 1] + * because that's the only case when the clamp vertex color state has + * no effect. + */ + if (is_interpolated_color(linkage, i) && + (slot->producer.value->type != nir_instr_type_load_const || + nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 || + nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1)) + continue; + + /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's + * the coord replace value. + */ + if (is_interpolated_texcoord(linkage, i)) { + assert(i % 2 == 0); /* high 16-bit slots disallowed */ + + if (i % 8 == 0 || /* TEXn.x */ + i % 8 == 2 || /* TEXn.y */ + slot->producer.value->type != nir_instr_type_load_const) + continue; + + float value = + nir_instr_as_load_const(slot->producer.value)->value[0].f32; + + /* This ignores signed zeros, but those are destroyed by + * interpolation, so it doesn't matter. + */ + if ((i % 8 == 4 && value != 0) || + (i % 8 == 6 && value != 1)) + continue; + } + + /* Replace all loads. Do that for both input and output loads. */ + for (unsigned list_index = 0; list_index < 2; list_index++) { + struct list_head *load = list_index ? &slot->producer.loads : + &slot->consumer.loads; + nir_builder *b = list_index ? &linkage->producer_builder : + &linkage->consumer_builder; + + list_for_each_entry(struct list_node, node, load, head) { + nir_intrinsic_instr *loadi = node->instr; + b->cursor = nir_before_instr(&loadi->instr); + + /* Copy the uniform expression before the load. */ + nir_def *clone = clone_ssa(linkage, b, + nir_instr_def(slot->producer.value)); + + /* Interpolation converts Infs to NaNs. If we skip it, we need to + * convert Infs to NaNs manually. + */ + if (loadi->intrinsic == nir_intrinsic_load_interpolated_input && + preserve_nans(b->shader, clone->bit_size)) + clone = build_convert_inf_to_nan(b, clone); + + /* Replace the original load. 
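+ * The clone was built right before the load, so every use of the
+ * load remains dominated by its replacement.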
*/ + nir_def_rewrite_uses(&loadi->def, clone); + nir_instr_remove(&loadi->instr); + *progress |= list_index ? nir_progress_producer : + nir_progress_consumer; + } + } + + /* Clear the lists. */ + list_inithead(&slot->producer.loads); + list_inithead(&slot->consumer.loads); + + /* Remove all stores now that loads have been replaced. */ + remove_all_stores_and_clear_slot(linkage, i, progress); + } +} + +/****************************************************************** + * OUTPUT DEDUPLICATION + ******************************************************************/ + +/* We can only deduplicate outputs that have the same qualifier, and color + * components must be deduplicated separately because they are affected by GL + * states. + * + * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for + * deduplication as long as it's not flat. + * + * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat + * by the flatshade state, so it can't be deduplicated with + * QUAL_COLOR_INTERP_ANY, which is never flat. + */ +enum var_qualifier { + QUAL_PATCH, + QUAL_VAR_FLAT, + QUAL_COLOR_FLAT, + /* When nir_io_has_flexible_input_interpolation_except_flat is set: */ + QUAL_VAR_INTERP_ANY, + QUAL_COLOR_INTERP_ANY, + QUAL_COLOR_SHADEMODEL_ANY, + /* When nir_io_has_flexible_input_interpolation_except_flat is unset: */ + QUAL_VAR_PERSP_PIXEL, + QUAL_VAR_PERSP_CENTROID, + QUAL_VAR_PERSP_SAMPLE, + QUAL_VAR_LINEAR_PIXEL, + QUAL_VAR_LINEAR_CENTROID, + QUAL_VAR_LINEAR_SAMPLE, + QUAL_COLOR_PERSP_PIXEL, + QUAL_COLOR_PERSP_CENTROID, + QUAL_COLOR_PERSP_SAMPLE, + QUAL_COLOR_LINEAR_PIXEL, + QUAL_COLOR_LINEAR_CENTROID, + QUAL_COLOR_LINEAR_SAMPLE, + QUAL_COLOR_SHADEMODEL_PIXEL, + QUAL_COLOR_SHADEMODEL_CENTROID, + QUAL_COLOR_SHADEMODEL_SAMPLE, + NUM_DEDUP_QUALIFIERS, + + QUAL_SKIP, + QUAL_UNKNOWN, +}; + +/* Return the input qualifier if all loads use the same one, else skip. + * This is only used by output deduplication to determine input compatibility. + */ +static enum var_qualifier +get_input_qualifier(struct linkage_info *linkage, unsigned i) +{ + assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT); + struct scalar_slot *slot = &linkage->slot[i]; + bool is_color = is_interpolated_color(linkage, i); + nir_intrinsic_instr *load = + list_first_entry(&slot->consumer.loads, struct list_node, head)->instr; + + if (load->intrinsic == nir_intrinsic_load_input) + return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT; + + assert(load->intrinsic == nir_intrinsic_load_interpolated_input); + nir_intrinsic_instr *baryc = + nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); + + if (linkage->consumer_builder.shader->options->io_options & + nir_io_has_flexible_input_interpolation_except_flat) { + if (is_color) { + return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ? + QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY; + } else { + return QUAL_VAR_INTERP_ANY; + } + } + + /* Get the exact interpolation qualifier. */ + unsigned pixel_location; + enum var_qualifier qual; + + switch (baryc->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + pixel_location = 0; + break; + case nir_intrinsic_load_barycentric_centroid: + pixel_location = 1; + break; + case nir_intrinsic_load_barycentric_sample: + pixel_location = 2; + break; + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_barycentric_at_sample: + /* Don't deduplicate outputs that are interpolated at offset/sample. 
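+ * Their barycentrics take dynamic operands, so two such loads are
+ * only equivalent if those operands match too, which the
+ * per-qualifier hashing below can't express.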
*/ + return QUAL_SKIP; + default: + unreachable("unexpected barycentric src"); + } + + switch (nir_intrinsic_interp_mode(baryc)) { + case INTERP_MODE_NONE: + qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL : + QUAL_VAR_PERSP_PIXEL; + break; + case INTERP_MODE_SMOOTH: + qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL; + break; + case INTERP_MODE_NOPERSPECTIVE: + qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL; + break; + default: + unreachable("unexpected interp mode"); + } + + /* The ordering of the "qual" enum was carefully chosen to make this + * addition correct. + */ + STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID); + STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE); + STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID); + STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE); + STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID); + STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE); + STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID); + STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE); + STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 == + QUAL_COLOR_SHADEMODEL_CENTROID); + STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 == + QUAL_COLOR_SHADEMODEL_SAMPLE); + return qual + pixel_location; +} + +static void +deduplicate_outputs(struct linkage_info *linkage, + nir_opt_varyings_progress *progress) +{ + struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL}; + unsigned i; + + /* Find duplicated outputs. If there are multiple stores, they should all + * store the same value as all stores of some other output. That's + * guaranteed by output_equal_mask. + */ + BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { + if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate) + continue; + + struct scalar_slot *slot = &linkage->slot[i]; + enum var_qualifier qualifier; + gl_varying_slot var_slot = vec4_slot(i); + + /* Determine which qualifier this slot has. */ + if ((var_slot >= VARYING_SLOT_PATCH0 && + var_slot <= VARYING_SLOT_PATCH31) || + var_slot == VARYING_SLOT_TESS_LEVEL_INNER || + var_slot == VARYING_SLOT_TESS_LEVEL_OUTER) + qualifier = QUAL_PATCH; + else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT) + qualifier = QUAL_VAR_FLAT; + else + qualifier = get_input_qualifier(linkage, i); + + if (qualifier == QUAL_SKIP) + continue; + + struct hash_table **table = &tables[qualifier]; + if (!*table) + *table = _mesa_pointer_hash_table_create(NULL); + + nir_instr *value = slot->producer.value; + + struct hash_entry *entry = _mesa_hash_table_search(*table, value); + if (!entry) { + _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i); + continue; + } + + /* We've found a duplicate. Redirect loads and remove stores. */ + struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data]; + nir_intrinsic_instr *store = + list_first_entry(&found_slot->producer.stores, + struct list_node, head)->instr; + nir_io_semantics sem = nir_intrinsic_io_semantics(store); + unsigned component = nir_intrinsic_component(store); + + /* Redirect loads. */ + for (unsigned list_index = 0; list_index < 2; list_index++) { + struct list_head *src_loads = list_index ? &slot->producer.loads : + &slot->consumer.loads; + struct list_head *dst_loads = list_index ? 
&found_slot->producer.loads : + &found_slot->consumer.loads; + bool has_progress = !list_is_empty(src_loads); + + list_for_each_entry(struct list_node, iter, src_loads, head) { + nir_intrinsic_instr *loadi = iter->instr; + + nir_intrinsic_set_io_semantics(loadi, sem); + nir_intrinsic_set_component(loadi, component); + + /* We also need to set the base to match the duplicate load, so + * that CSE can eliminate it. + */ + if (!list_is_empty(dst_loads)) { + struct list_node *first = + list_first_entry(dst_loads, struct list_node, head); + nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr)); + } else { + /* Use the base of the found store if there are no loads (it can + * only happen with TCS). + */ + assert(list_index == 0); + nir_intrinsic_set_base(loadi, nir_intrinsic_base(store)); + } + } + + if (has_progress) { + /* Move the redirected loads to the found slot, so that compaction + * can find them. + */ + list_splicetail(src_loads, dst_loads); + list_inithead(src_loads); + + *progress |= list_index ? nir_progress_producer : + nir_progress_consumer; + } + } + + /* Remove all duplicated stores now that loads have been redirected. */ + remove_all_stores_and_clear_slot(linkage, i, progress); + } + + for (unsigned i = 0; i < ARRAY_SIZE(tables); i++) + _mesa_hash_table_destroy(tables[i], NULL); +} + +/****************************************************************** + * FIND OPEN-CODED TES INPUT INTERPOLATION + ******************************************************************/ + +static bool +is_sysval(nir_instr *instr, gl_system_value sysval) +{ + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (intr->intrinsic == nir_intrinsic_from_system_value(sysval)) + return true; + + if (intr->intrinsic == nir_intrinsic_load_deref) { + nir_deref_instr *deref = + nir_instr_as_deref(intr->src[0].ssa->parent_instr); + + return nir_deref_mode_is_one_of(deref, nir_var_system_value) && + deref->var->data.location == sysval; + } + } + + return false; +} + +static nir_alu_instr * +get_single_use_as_alu(nir_def *def) +{ + /* Only 1 use allowed. */ + if (!list_is_singular(&def->uses)) + return NULL; + + nir_instr *instr = + nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link)); + if (instr->type != nir_instr_type_alu) + return NULL; + + return nir_instr_as_alu(instr); +} + +static nir_alu_instr * +check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load, + unsigned *vertex_index, + unsigned *vertices_used, + unsigned max_vertices) +{ + if (load->intrinsic != nir_intrinsic_load_per_vertex_input) + return NULL; + + /* Check the vertex index. Each vertex can be loaded only once. 
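+ * This rejects patterns like v0*u + v0*v; the matching below relies
+ * on seeing exactly one load per vertex.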
*/
+ if (!nir_src_is_const(load->src[0]))
+ return NULL;
+
+ *vertex_index = nir_src_as_uint(load->src[0]);
+ if (*vertex_index >= max_vertices ||
+ *vertices_used & BITFIELD_BIT(*vertex_index))
+ return NULL;
+
+ *vertices_used |= BITFIELD_BIT(*vertex_index);
+
+ return get_single_use_as_alu(&load->def);
+}
+
+static bool
+gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul,
+ unsigned vertex_index, unsigned *tess_coord_swizzle,
+ unsigned *tess_coord_used, nir_def **load_tess_coord)
+{
+ unsigned other_src = fmul->src[0].src.ssa == &load->def;
+ nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr;
+
+ assert(fmul->src[!other_src].swizzle[0] == 0);
+
+ if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD))
+ return false;
+
+ unsigned tess_coord_component = fmul->src[other_src].swizzle[0];
+ /* Each tesscoord component can be used only once. */
+ if (*tess_coord_used & BITFIELD_BIT(tess_coord_component))
+ return false;
+
+ *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index);
+ *tess_coord_used |= BITFIELD_BIT(tess_coord_component);
+ *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def;
+ return true;
+}
+
+/**
+ * Find interpolation of the form:
+ * input[0].slot * TessCoord.a +
+ * input[1].slot * TessCoord.b +
+ * input[2].slot * TessCoord.c;
+ *
+ * a,b,c can be any of x,y,z, but each can occur only once.
+ */
+static bool
+find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i)
+{
+ struct scalar_slot *slot = &linkage->slot[i];
+ unsigned vertices_used = 0;
+ unsigned tess_coord_used = 0;
+ unsigned tess_coord_swizzle = 0;
+ unsigned num_fmuls = 0, num_fadds = 0;
+ nir_alu_instr *fadds[2];
+ nir_def *load_tess_coord = NULL;
+
+ /* Find 3 multiplications by TessCoord and their uses, which must be
+ * fadds.
+ */
+ list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
+ unsigned vertex_index;
+ nir_alu_instr *fmul =
+ check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
+ &vertices_used, 3);
+ /* At most 3 loads are expected. Also reject exact ops because we
+ * are going to do an inexact transformation with them.
+ */
+ if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 ||
+ !gather_fmul_tess_coord(iter->instr, fmul, vertex_index,
+ &tess_coord_swizzle, &tess_coord_used,
+ &load_tess_coord))
+ return false;
+
+ num_fmuls++;
+
+ /* The multiplication must only be used by fadd. Also reject exact ops.
+ */
+ nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def);
+ if (!fadd || fadd->op != nir_op_fadd || fadd->exact)
+ return false;
+
+ /* The 3 fmuls must only be used by 2 fadds. */
+ unsigned i;
+ for (i = 0; i < num_fadds; i++) {
+ if (fadds[i] == fadd)
+ break;
+ }
+ if (i == num_fadds) {
+ if (num_fadds == 2)
+ return false;
+
+ fadds[num_fadds++] = fadd;
+ }
+ }
+
+ if (num_fmuls != 3 || num_fadds != 2)
+ return false;
+
+ assert(tess_coord_used == 0x7);
+
+ /* We have found that the only uses of the 3 fmuls are 2 fadds, which
+ * implies that at least 2 fmuls are used by the same fadd.
+ *
+ * Check that 1 fadd is used by the other fadd, which can only be
+ * the result of the TessCoord interpolation. 
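+ * In other words, the accepted shape is fadd(fadd(fmul, fmul), fmul),
+ * up to operand order.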
+ */ + for (unsigned i = 0; i < 2; i++) { + if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) { + switch (tess_coord_swizzle) { + case 0x210: + slot->consumer.tes_interp_load = fadds[!i]; + slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW; + slot->consumer.tes_load_tess_coord = load_tess_coord; + return true; + + case 0x102: + slot->consumer.tes_interp_load = fadds[!i]; + slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV; + slot->consumer.tes_load_tess_coord = load_tess_coord; + return true; + + default: + return false; + } + } + } + + return false; +} + +/** + * Find interpolation of the form: + * fma(input[0].slot, TessCoord.a, + * fma(input[1].slot, TessCoord.b, + * input[2].slot * TessCoord.c)) + * + * a,b,c can be any of x,y,z, but each can occur only once. + */ +static bool +find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i) +{ + struct scalar_slot *slot = &linkage->slot[i]; + unsigned vertices_used = 0; + unsigned tess_coord_used = 0; + unsigned tess_coord_swizzle = 0; + unsigned num_fmuls = 0, num_ffmas = 0; + nir_alu_instr *ffmas[2], *fmul = NULL; + nir_def *load_tess_coord = NULL; + + list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) { + unsigned vertex_index; + nir_alu_instr *alu = + check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index, + &vertices_used, 3); + + /* Reject exact ops because we are going to do an inexact transformation + * with it. + */ + if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) || + alu->exact || + !gather_fmul_tess_coord(iter->instr, alu, vertex_index, + &tess_coord_swizzle, &tess_coord_used, + &load_tess_coord)) + return false; + + /* The multiplication must only be used by ffma. */ + if (alu->op == nir_op_fmul) { + nir_alu_instr *ffma = get_single_use_as_alu(&alu->def); + if (!ffma || ffma->op != nir_op_ffma) + return false; + + if (num_fmuls == 1) + return false; + + fmul = alu; + num_fmuls++; + } else { + if (num_ffmas == 2) + return false; + + ffmas[num_ffmas++] = alu; + } + } + + if (num_fmuls != 1 || num_ffmas != 2) + return false; + + assert(tess_coord_used == 0x7); + + /* We have found that fmul has only 1 use and it's ffma, and there are 2 + * ffmas. Fail if neither ffma is using fmul. + */ + if (ffmas[0]->src[2].src.ssa != &fmul->def && + ffmas[1]->src[2].src.ssa != &fmul->def) + return false; + + /* If one ffma is using the other ffma, it's guaranteed to be src[2]. 
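+ * src[0] and src[1] of both ffmas were already matched as a per-vertex
+ * load and a TessCoord component by gather_fmul_tess_coord().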
*/ + for (unsigned i = 0; i < 2; i++) { + if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) { + switch (tess_coord_swizzle) { + case 0x210: + slot->consumer.tes_interp_load = ffmas[!i]; + slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW; + slot->consumer.tes_load_tess_coord = load_tess_coord; + return true; + + case 0x102: + slot->consumer.tes_interp_load = ffmas[!i]; + slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV; + slot->consumer.tes_load_tess_coord = load_tess_coord; + return true; + + default: + return false; + } + } + } + + return false; +} + +static void +find_open_coded_tes_input_interpolation(struct linkage_info *linkage) +{ + if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL) + return; + + unsigned i; + BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) { + if (vec4_slot(i) >= VARYING_SLOT_PATCH0 && + vec4_slot(i) <= VARYING_SLOT_PATCH31) + continue; + if (find_tes_triangle_interp_3fmul_2fadd(linkage, i)) + continue; + if (find_tes_triangle_interp_1fmul_2ffma(linkage, i)) + continue; + } + + BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) { + if (vec4_slot(i) >= VARYING_SLOT_PATCH0 && + vec4_slot(i) <= VARYING_SLOT_PATCH31) + continue; + if (find_tes_triangle_interp_3fmul_2fadd(linkage, i)) + continue; + if (find_tes_triangle_interp_1fmul_2ffma(linkage, i)) + continue; + } +} + +/****************************************************************** + * BACKWARD INTER-SHADER CODE MOTION + ******************************************************************/ + +#define NEED_UPDATE_MOVABLE_FLAGS(instr) \ + (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE))) + +#define GET_SRC_INTERP(alu, i) \ + ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK) + +static bool +can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu) +{ + /* Exact ALUs can't be moved across interpolation. */ + if (alu->exact) + return false; + + /* Interpolation converts Infs to NaNs. If we turn a result of an ALU + * instruction into a new interpolated input, it converts Infs to NaNs for + * that instruction, while removing the Infs to NaNs conversion for sourced + * interpolated values. We can't do that if Infs and NaNs must be preserved. + */ + if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size)) + return false; + + switch (alu->op) { + /* Always legal if the sources are interpolated identically because: + * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j) + * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j) + */ + case nir_op_fadd: + case nir_op_fsub: + /* This is the same as multiplying by -1, which is always legal, see fmul. + */ + case nir_op_fneg: + case nir_op_mov: + return true; + + /* At least one side of the multiplication must be convergent because this + * is the only equation with multiplication that is true: + * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j) + */ + case nir_op_fmul: + case nir_op_fmulz: + case nir_op_ffma: + case nir_op_ffmaz: + return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT || + GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT; + + case nir_op_fdiv: + /* The right side must be convergent, which then follows the fmul rule. + */ + return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT; + + case nir_op_flrp: + /* Using the same rule as fmul. 
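+ * flrp(x, y, t) = x*(1 - t) + y*t, so either both x and y must be
+ * convergent (t may interpolate), or t must be convergent (x and y
+ * may interpolate).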
*/ + return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT && + GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) || + GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT; + + default: + /* Moving other ALU instructions across interpolation is illegal. */ + return false; + } +} + +/* Determine whether an instruction is movable from the consumer to + * the producer. Also determine which interpolation modes each ALU instruction + * should use if its value was promoted to a new input. + */ +static void +update_movable_flags(struct linkage_info *linkage, nir_instr *instr) +{ + /* This function shouldn't be called more than once for each instruction + * to minimize recursive calling. + */ + assert(NEED_UPDATE_MOVABLE_FLAGS(instr)); + + switch (instr->type) { + case nir_instr_type_undef: + case nir_instr_type_load_const: + /* Treat constants as convergent, which means compatible with both flat + * and non-flat inputs. + */ + instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT; + return; + + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + unsigned alu_interp; + + /* These are shader-dependent and thus unmovable. */ + if (nir_op_is_derivative(alu->op)) { + instr->pass_flags |= FLAG_UNMOVABLE; + return; + } + + /* Make vector ops unmovable. They are technically movable but more + * complicated, and NIR should be scalarized for this pass anyway. + * The only remaining vector ops should be vecN for intrinsic sources. + */ + if (alu->def.num_components > 1) { + instr->pass_flags |= FLAG_UNMOVABLE; + return; + } + + alu_interp = FLAG_INTERP_CONVERGENT; + + for (unsigned i = 0; i < num_srcs; i++) { + nir_instr *src_instr = alu->src[i].src.ssa->parent_instr; + + if (NEED_UPDATE_MOVABLE_FLAGS(src_instr)) + update_movable_flags(linkage, src_instr); + + if (src_instr->pass_flags & FLAG_UNMOVABLE) { + instr->pass_flags |= FLAG_UNMOVABLE; + return; + } + + /* Determine which interpolation mode this ALU instruction should + * use if it was promoted to a new input. + */ + unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK; + + if (alu_interp == src_interp || + src_interp == FLAG_INTERP_CONVERGENT) { + /* Nothing to do. */ + } else if (alu_interp == FLAG_INTERP_CONVERGENT) { + alu_interp = src_interp; + } else { + assert(alu_interp != FLAG_INTERP_CONVERGENT && + src_interp != FLAG_INTERP_CONVERGENT && + alu_interp != src_interp); + /* The ALU instruction sources conflicting interpolation flags. + * It can never become a new input. + */ + instr->pass_flags |= FLAG_UNMOVABLE; + return; + } + } + + /* Check if we can move the ALU instruction across an interpolated + * load into the previous shader. + */ + if (alu_interp > FLAG_INTERP_FLAT && + !can_move_alu_across_interp(linkage, alu)) { + instr->pass_flags |= FLAG_UNMOVABLE; + return; + } + + instr->pass_flags |= FLAG_MOVABLE | alu_interp; + return; + } + + case nir_instr_type_intrinsic: { + /* Movable input loads already have FLAG_MOVABLE on them. + * Unmovable input loads skipped by initialization get UNMOVABLE here. + * (e.g. colors, texcoords) + * + * The only other movable intrinsic is load_deref for uniforms and UBOs. + * Other intrinsics are not movable. 
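+ * (they either have side effects or produce values that differ
+ * between the two stages, e.g. system values)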
+ */
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+ if (intr->intrinsic == nir_intrinsic_load_deref) {
+ nir_instr *deref = intr->src[0].ssa->parent_instr;
+
+ if (NEED_UPDATE_MOVABLE_FLAGS(deref))
+ update_movable_flags(linkage, deref);
+
+ if (deref->pass_flags & FLAG_MOVABLE) {
+ /* Treat uniforms as convergent, which means compatible with both
+ * flat and non-flat inputs.
+ */
+ instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
+ return;
+ }
+ }
+
+ instr->pass_flags |= FLAG_UNMOVABLE;
+ return;
+ }
+
+ case nir_instr_type_deref:
+ if (can_move_deref_between_shaders(linkage, instr))
+ instr->pass_flags |= FLAG_MOVABLE;
+ else
+ instr->pass_flags |= FLAG_UNMOVABLE;
+ return;
+
+ default:
+ instr->pass_flags |= FLAG_UNMOVABLE;
+ return;
+ }
+}
+
+/* Gather the input loads used by the post-dominator using DFS. */
+static void
+gather_used_input_loads(nir_instr *instr,
+ nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS],
+ unsigned *num_loads)
+{
+ switch (instr->type) {
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ return;
+
+ case nir_instr_type_alu: {
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+ unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
+
+ for (unsigned i = 0; i < num_srcs; i++) {
+ gather_used_input_loads(alu->src[i].src.ssa->parent_instr,
+ loads, num_loads);
+ }
+ return;
+ }
+
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_deref:
+ case nir_intrinsic_load_tess_coord:
+ return;
+
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_per_vertex_input:
+ case nir_intrinsic_load_interpolated_input:
+ if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) {
+ assert(*num_loads < NUM_SCALAR_SLOTS*8);
+ loads[(*num_loads)++] = intr;
+ intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED;
+ }
+ return;
+
+ default:
+ unreachable("unexpected intrinsic");
+ }
+ }
+
+ default:
+ unreachable("unexpected instr type");
+ }
+}
+
+/* Move a post-dominator, which is an ALU instruction, into the previous
+ * shader, and replace the post-dominator with a new input load.
+ */
+static bool
+try_move_postdominator(struct linkage_info *linkage,
+ struct nir_use_dominance_state *postdom_state,
+ nir_alu_instr *postdom,
+ nir_def *load_def,
+ nir_intrinsic_instr *first_load,
+ nir_opt_varyings_progress *progress)
+{
+#define PRINT 0
+#if PRINT
+ printf("Trying to move post-dom: ");
+ nir_print_instr(&postdom->instr, stdout);
+ puts("");
+#endif
+
+ /* Gather the input loads used by the post-dominator using DFS. */
+ nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8];
+ unsigned num_loads = 0;
+ gather_used_input_loads(&postdom->instr, loads, &num_loads);
+
+ /* Clear the flag set by gather_used_input_loads. */
+ for (unsigned i = 0; i < num_loads; i++)
+ loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED;
+
+ /* For all the loads, the previous shader must have the corresponding
+ * output stores in the same basic block because we are going to replace
+ * them with 1 store. Only TCS and GS can have stores of different outputs
+ * in different blocks. 
+ */ + nir_block *block = NULL; + + for (unsigned i = 0; i < num_loads; i++) { + unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]); + struct scalar_slot *slot = &linkage->slot[slot_index]; + + assert(list_is_singular(&slot->producer.stores)); + nir_intrinsic_instr *store = + list_first_entry(&slot->producer.stores, struct list_node, + head)->instr; + + if (!block) { + block = store->instr.block; + continue; + } + if (block != store->instr.block) + return false; + } + + assert(block); + +#if PRINT + printf("Post-dom accepted: "); + nir_print_instr(&postdom->instr, stdout); + puts("\n"); +#endif + + /* Determine the scalar slot index of the new varying. It will reuse + * the slot of the load we started from because the load will be + * removed. + */ + unsigned final_slot = intr_get_scalar_16bit_slot(first_load); + + /* Replace the post-dominator in the consumer with a new input load. + * Since we are reusing the same slot as the first load and it has + * the right interpolation qualifiers, use it as the new load by using + * it in place of the post-dominator. + * + * Boolean post-dominators are upcast in the producer and then downcast + * in the consumer. + */ + unsigned slot_index = final_slot; + struct scalar_slot *slot = &linkage->slot[slot_index]; + nir_builder *b = &linkage->consumer_builder; + b->cursor = nir_after_instr(load_def->parent_instr); + unsigned alu_interp = postdom->instr.pass_flags & FLAG_INTERP_MASK; + nir_def *new_input, *new_tes_loads[3]; + BITSET_WORD *mask; + + /* NIR can't do 1-bit inputs. Convert them to a bigger size. */ + assert(postdom->def.bit_size & (1 | 16 | 32)); + unsigned new_bit_size = postdom->def.bit_size; + + if (new_bit_size == 1) { + assert(alu_interp == FLAG_INTERP_CONVERGENT || + alu_interp == FLAG_INTERP_FLAT); + /* TODO: We could use 16 bits instead, but that currently fails on AMD. + */ + new_bit_size = 32; + } + + /* Create the new input load. This creates a new load (or a series of + * loads in case of open-coded TES interpolation) that's identical to + * the original load(s). + */ + if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && + alu_interp > FLAG_INTERP_FLAT) { + nir_def *baryc = NULL; + + /* Determine the barycentric coordinates. */ + switch (alu_interp) { + case FLAG_INTERP_PERSP_PIXEL: + case FLAG_INTERP_LINEAR_PIXEL: + baryc = nir_load_barycentric_pixel(b, 32); + break; + case FLAG_INTERP_PERSP_CENTROID: + case FLAG_INTERP_LINEAR_CENTROID: + baryc = nir_load_barycentric_centroid(b, 32); + break; + case FLAG_INTERP_PERSP_SAMPLE: + case FLAG_INTERP_LINEAR_SAMPLE: + baryc = nir_load_barycentric_sample(b, 32); + break; + } + + nir_intrinsic_instr *baryc_i = + nir_instr_as_intrinsic(baryc->parent_instr); + + if (alu_interp == FLAG_INTERP_LINEAR_PIXEL || + alu_interp == FLAG_INTERP_LINEAR_CENTROID || + alu_interp == FLAG_INTERP_LINEAR_SAMPLE) + nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE); + else + nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH); + + new_input = nir_load_interpolated_input( + b, 1, new_bit_size, baryc, nir_imm_int(b, 0), + .base = nir_intrinsic_base(first_load), + .component = nir_intrinsic_component(first_load), + .dest_type = nir_intrinsic_dest_type(first_load), + .io_semantics = nir_intrinsic_io_semantics(first_load)); + + mask = new_bit_size == 16 ? 
linkage->interp_fp16_mask + : linkage->interp_fp32_mask; + } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && + alu_interp > FLAG_INTERP_FLAT) { + nir_def *zero = nir_imm_int(b, 0); + + for (unsigned i = 0; i < 3; i++) { + new_tes_loads[i] = + nir_load_per_vertex_input(b, 1, new_bit_size, + i ? nir_imm_int(b, i) : zero, zero, + .base = nir_intrinsic_base(first_load), + .component = nir_intrinsic_component(first_load), + .dest_type = nir_intrinsic_dest_type(first_load), + .io_semantics = nir_intrinsic_io_semantics(first_load)); + } + + int remap_uvw[3] = {0, 1, 2}; + int remap_wuv[3] = {2, 0, 1}; + int *remap; + + switch (alu_interp) { + case FLAG_INTERP_TES_TRIANGLE_UVW: + remap = remap_uvw; + break; + case FLAG_INTERP_TES_TRIANGLE_WUV: + remap = remap_wuv; + break; + default: + unreachable("invalid TES interpolation mode"); + } + + nir_def *tesscoord = slot->consumer.tes_load_tess_coord; + nir_def *defs[3]; + + for (unsigned i = 0; i < 3; i++) { + if (i == 0) { + defs[i] = nir_fmul(b, new_tes_loads[i], + nir_channel(b, tesscoord, remap[i])); + } else { + defs[i] = nir_ffma(b, new_tes_loads[i], + nir_channel(b, tesscoord, remap[i]), + defs[i - 1]); + } + } + new_input = defs[2]; + + mask = new_bit_size == 16 ? linkage->flat16_mask + : linkage->flat32_mask; + } else { + new_input = + nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0), + .base = nir_intrinsic_base(first_load), + .component = nir_intrinsic_component(first_load), + .dest_type = nir_intrinsic_dest_type(first_load), + .io_semantics = nir_intrinsic_io_semantics(first_load)); + + if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && + alu_interp == FLAG_INTERP_CONVERGENT) { + mask = new_bit_size == 16 ? linkage->convergent16_mask + : linkage->convergent32_mask; + } else { + mask = new_bit_size == 16 ? linkage->flat16_mask + : linkage->flat32_mask; + } + } + + assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index)); + assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index)); + + /* Re-set the category of the new scalar input. This will cause + * the compaction to treat it as a different type, so that it will move it + * into the vec4 that has compatible interpolation qualifiers. + * + * This shouldn't be done if any of the interp masks are not set, which + * indicates that compaction is disallowed. + */ + if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) || + BITSET_TEST(linkage->interp_fp16_mask, slot_index) || + BITSET_TEST(linkage->flat32_mask, slot_index) || + BITSET_TEST(linkage->flat16_mask, slot_index) || + BITSET_TEST(linkage->convergent32_mask, slot_index) || + BITSET_TEST(linkage->convergent16_mask, slot_index)) { + BITSET_CLEAR(linkage->interp_fp32_mask, slot_index); + BITSET_CLEAR(linkage->interp_fp16_mask, slot_index); + BITSET_CLEAR(linkage->flat16_mask, slot_index); + BITSET_CLEAR(linkage->flat32_mask, slot_index); + BITSET_CLEAR(linkage->convergent16_mask, slot_index); + BITSET_CLEAR(linkage->convergent32_mask, slot_index); + BITSET_SET(mask, slot_index); + } + + /* Replace the existing load with the new load in the slot. */ + if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && + alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) { + /* For TES, replace all 3 loads. 
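+ * The list entries are redirected to the new per-vertex loads, so
+ * the slot keeps tracking all of its loads.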
*/ + unsigned i = 0; + list_for_each_entry(struct list_node, iter, &slot->consumer.loads, + head) { + assert(i < 3); + iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr); + i++; + } + + assert(i == 3); + assert(postdom->def.bit_size != 1); + + slot->consumer.tes_interp_load = + nir_instr_as_alu(new_input->parent_instr); + } else { + assert(list_is_singular(&slot->consumer.loads)); + list_first_entry(&slot->consumer.loads, struct list_node, head)->instr = + nir_instr_as_intrinsic(new_input->parent_instr); + + /* The input is a bigger type even if the post-dominator is boolean. */ + if (postdom->def.bit_size == 1) + new_input = nir_ine_imm(b, new_input, 0); + } + + nir_def_rewrite_uses(&postdom->def, new_input); + + /* Clone the post-dominator at the end of the block in the producer + * where the output stores are. + */ + b = &linkage->producer_builder; + b->cursor = nir_after_block_before_jump(block); + nir_def *producer_clone = clone_ssa(linkage, b, &postdom->def); + + /* Boolean post-dominators are upcast in the producer because we can't + * use 1-bit outputs. + */ + if (producer_clone->bit_size == 1) + producer_clone = nir_b2bN(b, producer_clone, new_bit_size); + + /* Move the existing store to the end of the block and rewrite it to use + * the post-dominator result. + */ + nir_intrinsic_instr *store = + list_first_entry(&linkage->slot[final_slot].producer.stores, + struct list_node, head)->instr; + nir_instr_move(b->cursor, &store->instr); + nir_src_rewrite(&store->src[0], producer_clone); + + /* Remove all loads and stores that we are replacing from the producer + * and consumer. + */ + for (unsigned i = 0; i < num_loads; i++) { + unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]); + + if (slot_index == final_slot) { + /* Keep the load and store that we reused. */ + continue; + } + + /* Remove loads and stores that are dead after the code motion. Only + * those loads that are post-dominated by the post-dominator are dead. + */ + struct scalar_slot *slot = &linkage->slot[slot_index]; + nir_instr *load; + + if (slot->consumer.tes_interp_load) { + load = &slot->consumer.tes_interp_load->instr; + + /* With interpolated TES loads, we get here 3 times, once for each + * per-vertex load. Skip this if we've been here before. + */ + if (list_is_empty(&slot->producer.stores)) { + assert(list_is_empty(&slot->consumer.loads)); + continue; + } + } else { + assert(list_is_singular(&slot->consumer.loads)); + load = &list_first_entry(&slot->consumer.loads, + struct list_node, head)->instr->instr; + } + + if (nir_instr_dominates_use(postdom_state, &postdom->instr, load)) { + list_inithead(&slot->consumer.loads); + + /* Remove stores. (transform feedback is allowed here, just not + * in final_slot) + */ + remove_all_stores_and_clear_slot(linkage, slot_index, progress); + } + } + + *progress |= nir_progress_producer | nir_progress_consumer; + return true; +} + +static bool +backward_inter_shader_code_motion(struct linkage_info *linkage, + nir_opt_varyings_progress *progress) +{ + /* These producers are not supported. The description at the beginning + * suggests a possible workaround. + */ + if (linkage->producer_stage == MESA_SHADER_GEOMETRY || + linkage->producer_stage == MESA_SHADER_MESH || + linkage->producer_stage == MESA_SHADER_TASK) + return false; + + /* Clear pass_flags. */ + nir_shader_clear_pass_flags(linkage->consumer_builder.shader); + + /* Gather inputs that can be moved into the previous shader. These are only + * checked for the basic constraints for movability. 
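+ * The expensive part, finding a movable post-dominator of each load,
+ * runs afterwards on this gathered list.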
+   struct {
+      nir_def *def;
+      nir_intrinsic_instr *first_load;
+   } movable_loads[NUM_SCALAR_SLOTS];
+   unsigned num_movable_loads = 0;
+   unsigned i;
+
+   BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
+      if (!can_optimize_varying(linkage,
+                                vec4_slot(i)).inter_shader_code_motion)
+         continue;
+
+      struct scalar_slot *slot = &linkage->slot[i];
+
+      assert(!list_is_empty(&slot->producer.stores));
+      assert(!is_interpolated_texcoord(linkage, i));
+      assert(!is_interpolated_color(linkage, i));
+
+      /* Disallow producer loads. */
+      if (!list_is_empty(&slot->producer.loads))
+         continue;
+
+      /* There should be only 1 store per output. */
+      if (!list_is_singular(&slot->producer.stores))
+         continue;
+
+      nir_def *load_def = NULL;
+      nir_intrinsic_instr *load =
+         list_first_entry(&slot->consumer.loads, struct list_node,
+                          head)->instr;
+
+      nir_intrinsic_instr *store =
+         list_first_entry(&slot->producer.stores, struct list_node,
+                          head)->instr;
+
+      /* Set interpolation flags.
+       * Handle interpolated TES loads first because they are special.
+       */
+      if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
+          slot->consumer.tes_interp_load) {
+         if (linkage->producer_stage == MESA_SHADER_VERTEX) {
+            /* VS -> TES has no constraints on VS stores. */
+            load_def = &slot->consumer.tes_interp_load->def;
+            load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
+                                                  slot->consumer.tes_interp_mode;
+         } else {
+            assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
+            assert(store->intrinsic == nir_intrinsic_store_per_vertex_output);
+
+            /* The vertex index of the store must be InvocationID. */
+            if (is_sysval(store->src[1].ssa->parent_instr,
+                          SYSTEM_VALUE_INVOCATION_ID)) {
+               load_def = &slot->consumer.tes_interp_load->def;
+               load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
+                                                     slot->consumer.tes_interp_mode;
+            } else {
+               continue;
+            }
+         }
+      } else {
+         /* Allow only 1 load per input. CSE should be run before this. */
+         if (!list_is_singular(&slot->consumer.loads))
+            continue;
+
+         /* This can only be TCS -> TES, which is handled above and rejected
+          * otherwise.
+          */
+         if (store->intrinsic == nir_intrinsic_store_per_vertex_output) {
+            assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
+            continue;
+         }
+
+         /* TODO: handle load_per_vertex_input for TCS and GS.
+          * TES can also occur here if tes_interp_load is NULL.
+          */
+         if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
+            continue;
+
+         load_def = &load->def;
+
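+         /* Classify the load for later packing decisions. For example
+          * (matching the cases below): a load_interpolated_input whose
+          * barycentric source is load_barycentric_pixel with
+          * INTERP_MODE_NOPERSPECTIVE is tagged FLAG_INTERP_LINEAR_PIXEL,
+          * while any convergent input is tagged FLAG_INTERP_CONVERGENT
+          * regardless of its barycentric source.
+          */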
+         switch (load->intrinsic) {
+         case nir_intrinsic_load_interpolated_input: {
+            assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
+            nir_intrinsic_instr *baryc =
+               nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
+            nir_intrinsic_op op = baryc->intrinsic;
+            enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
+            bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
+            bool convergent = BITSET_TEST(linkage->convergent32_mask, i) ||
+                              BITSET_TEST(linkage->convergent16_mask, i);
+
+            assert(interp == INTERP_MODE_NONE ||
+                   interp == INTERP_MODE_SMOOTH ||
+                   interp == INTERP_MODE_NOPERSPECTIVE);
+
+            if (convergent) {
+               load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
+            } else if (op == nir_intrinsic_load_barycentric_pixel) {
+               load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL
+                                                : FLAG_INTERP_PERSP_PIXEL;
+            } else if (op == nir_intrinsic_load_barycentric_centroid) {
+               load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID
+                                                : FLAG_INTERP_PERSP_CENTROID;
+            } else if (op == nir_intrinsic_load_barycentric_sample) {
+               load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE
+                                                : FLAG_INTERP_PERSP_SAMPLE;
+            } else {
+               /* Optimizing at_offset and at_sample would be possible but
+                * maybe not worth it if they are not convergent. Convergent
+                * inputs can trivially switch the barycentric coordinates
+                * to different ones or flat.
+                */
+               continue;
+            }
+            break;
+         }
+         case nir_intrinsic_load_input:
+            if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+               if (BITSET_TEST(linkage->convergent32_mask, i) ||
+                   BITSET_TEST(linkage->convergent16_mask, i))
+                  load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
+               else
+                  load->instr.pass_flags |= FLAG_INTERP_FLAT;
+            } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
+               assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
+                      vec4_slot(i) <= VARYING_SLOT_PATCH31);
+               /* Patch inputs are always convergent. */
+               load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
+            } else {
+               /* It's not a fragment shader, but we still need to set this. */
+               load->instr.pass_flags |= FLAG_INTERP_FLAT;
+            }
+            break;
+         default:
+            unreachable("unexpected load intrinsic");
+         }
+      }
+
+      load_def->parent_instr->pass_flags |= FLAG_MOVABLE;
+
+      /* Disallow transform feedback. The load is "movable" for the purpose
+       * of finding a movable post-dominator; we just can't rewrite the store
+       * because we need to keep it for xfb, so the post-dominator search
+       * will have to start from a different load (only that varying will
+       * have its value rewritten).
+       */
+      if (BITSET_TEST(linkage->xfb_mask, i))
+         continue;
+
+      assert(num_movable_loads < ARRAY_SIZE(movable_loads));
+      movable_loads[num_movable_loads].def = load_def;
+      movable_loads[num_movable_loads].first_load = load;
+      num_movable_loads++;
+   }
+
+   if (!num_movable_loads)
+      return false;
+
+   struct nir_use_dominance_state *postdom_state =
+      nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true);
+
+   for (unsigned i = 0; i < num_movable_loads; i++) {
+      nir_def *load_def = movable_loads[i].def;
+      nir_instr *iter = load_def->parent_instr;
+      nir_instr *movable_postdom = NULL;
+
+      /* Find the farthest post-dominator that is movable. */
+      while (iter) {
+         iter = nir_get_immediate_use_dominator(postdom_state, iter);
+         if (iter) {
+            if (NEED_UPDATE_MOVABLE_FLAGS(iter))
+               update_movable_flags(linkage, iter);
+
+            if (iter->pass_flags & FLAG_UNMOVABLE)
+               break;
+
+            /* This can only be an ALU instruction. */
+            nir_alu_instr *alu = nir_instr_as_alu(iter);
+
+            /* Skip 64-bit defs and keep searching. Replacing 32-bit inputs
+             * with one 64-bit input is unlikely to benefit.
+             */
+            if (alu->def.bit_size == 64)
+               continue;
+
+            /* Skip comparison opcodes that directly source the first load
+             * and a constant because any 1-bit values would have to be
+             * converted to 32 bits in the producer and then converted back
+             * to 1 bit using nir_op_ine in the consumer, achieving nothing.
+             */
+            if (alu->def.bit_size == 1 &&
+                ((nir_op_infos[alu->op].num_inputs == 1 &&
+                  alu->src[0].src.ssa == load_def) ||
+                 (nir_op_infos[alu->op].num_inputs == 2 &&
+                  ((alu->src[0].src.ssa == load_def &&
+                    alu->src[1].src.ssa->parent_instr->type ==
+                       nir_instr_type_load_const) ||
+                   (alu->src[0].src.ssa->parent_instr->type ==
+                       nir_instr_type_load_const &&
+                    alu->src[1].src.ssa == load_def)))))
+               continue;
+
+            movable_postdom = iter;
+         }
+      }
+
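+      /* Illustration of the walk above (hypothetical chain): given
+       *
+       *    load -> fneg -> fexp2 -> fsat
+       *
+       * where each instruction is the only user of the previous one and
+       * all of them are movable, the loop steps through fneg and fexp2
+       * and settles on fsat, so the whole chain can be moved into the
+       * producer at once instead of one instruction at a time.
+       */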
+      /* Process the post-dominator unless it's been processed already. */
+      if (movable_postdom &&
+          !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) {
+         if (try_move_postdominator(linkage, postdom_state,
+                                    nir_instr_as_alu(movable_postdom),
+                                    load_def, movable_loads[i].first_load,
+                                    progress)) {
+            /* Moving only one post-dominator can change the IR enough that
+             * we should start from scratch.
+             */
+            ralloc_free(postdom_state);
+            return true;
+         }
+
+         movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED;
+      }
+   }
+
+   ralloc_free(postdom_state);
+   return false;
+}
+
+/******************************************************************
+ * COMPACTION
+ ******************************************************************/
+
+/* Relocate a slot to a new index. Used by compaction. new_index is
+ * the component index at 16-bit granularity, so the size of a vec4 is 8
+ * in that representation.
+ */
+static void
+relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot,
+              unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type,
+              nir_opt_varyings_progress *progress)
+{
+   assert(!list_is_empty(&slot->producer.stores));
+
+   list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) {
+      assert(!nir_intrinsic_io_semantics(iter->instr).no_varying ||
+             has_xfb(iter->instr) ||
+             linkage->producer_stage == MESA_SHADER_TESS_CTRL);
+      assert(!is_active_sysval_output(linkage, i, iter->instr));
+   }
+
+   /* Relocate the slot in all loads and stores. */
+   struct list_head *instruction_lists[3] = {
+      &slot->producer.stores,
+      &slot->producer.loads,
+      &slot->consumer.loads,
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) {
+      list_for_each_entry(struct list_node, iter, instruction_lists[i], head) {
+         nir_intrinsic_instr *intr = iter->instr;
+
+         gl_varying_slot new_semantic = vec4_slot(new_index);
+         unsigned new_component = (new_index % 8) / 2;
+         bool new_high_16bits = new_index % 2;
+
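+         /* For example, new_index == 45 decomposes to: vec4 slot
+          * 45 / 8 = 5, 32-bit component (45 % 8) / 2 = 2 (the .z channel),
+          * and high_16bits = 45 % 2 = 1 (the upper 16-bit half).
+          */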
+         /* We also need to relocate xfb info because it's always relative
+          * to component 0. This just moves it into the correct xfb slot.
+          */
+         if (has_xfb(intr)) {
+            unsigned old_component = nir_intrinsic_component(intr);
+            static const nir_io_xfb clear_xfb;
+            nir_io_xfb xfb;
+            bool new_is_odd = new_component % 2 == 1;
+
+            memset(&xfb, 0, sizeof(xfb));
+
+            if (old_component >= 2) {
+               xfb.out[new_is_odd] =
+                  nir_intrinsic_io_xfb2(intr).out[old_component - 2];
+               nir_intrinsic_set_io_xfb2(intr, clear_xfb);
+            } else {
+               xfb.out[new_is_odd] =
+                  nir_intrinsic_io_xfb(intr).out[old_component];
+               nir_intrinsic_set_io_xfb(intr, clear_xfb);
+            }
+
+            if (new_component >= 2)
+               nir_intrinsic_set_io_xfb2(intr, xfb);
+            else
+               nir_intrinsic_set_io_xfb(intr, xfb);
+         }
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         /* When relocating a back color store, don't change it to a front
+          * color as that would be incorrect. Keep it as a back color and
+          * only relocate it between BFC0 and BFC1.
+          */
+         if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
+             (sem.location == VARYING_SLOT_BFC0 ||
+              sem.location == VARYING_SLOT_BFC1)) {
+            assert(new_semantic == VARYING_SLOT_COL0 ||
+                   new_semantic == VARYING_SLOT_COL1);
+            new_semantic = VARYING_SLOT_BFC0 +
+                           (new_semantic - VARYING_SLOT_COL0);
+         }
+
+         sem.location = new_semantic;
+         sem.high_16bits = new_high_16bits;
+
+         nir_intrinsic_set_io_semantics(intr, sem);
+         nir_intrinsic_set_component(intr, new_component);
+
+         /* This path is used when promoting convergent interpolated
+          * inputs to flat. Replace load_interpolated_input with load_input.
+          */
+         if (fs_vec4_type == FS_VEC4_TYPE_FLAT &&
+             intr->intrinsic == nir_intrinsic_load_interpolated_input) {
+            assert(instruction_lists[i] == &slot->consumer.loads);
+            nir_builder *b = &linkage->consumer_builder;
+
+            b->cursor = nir_before_instr(&intr->instr);
+            nir_def *load =
+               nir_load_input(b, 1, intr->def.bit_size,
+                              nir_get_io_offset_src(intr)->ssa,
+                              .io_semantics = sem,
+                              .component = new_component,
+                              .dest_type = nir_intrinsic_dest_type(intr));
+
+            nir_def_rewrite_uses(&intr->def, load);
+            iter->instr = nir_instr_as_intrinsic(load->parent_instr);
+            nir_instr_remove(&intr->instr);
+            *progress |= nir_progress_consumer;
+
+            /* Interpolation converts Infs to NaNs. If we change it to flat,
+             * we need to convert Infs to NaNs manually in the producer to
+             * preserve that.
+             */
+            if (preserve_nans(linkage->consumer_builder.shader,
+                              load->bit_size)) {
+               list_for_each_entry(struct list_node, iter,
+                                   &slot->producer.stores, head) {
+                  nir_intrinsic_instr *store = iter->instr;
+
+                  nir_builder *b = &linkage->producer_builder;
+                  b->cursor = nir_before_instr(&store->instr);
+                  nir_def *repl =
+                     build_convert_inf_to_nan(b, store->src[0].ssa);
+                  nir_src_rewrite(&store->src[0], repl);
+               }
+            }
+         }
+      }
+   }
+}
+
+/**
+ * A helper function for compact_varyings(). Assign new slot indices for
+ * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip
+ * already-assigned scalar slots (determined by assigned_mask) and don't
+ * assign to vec4 slots that have an incompatible vec4 type (determined by
+ * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types.
+ * slot_size is the component size in units of 16 bits (2 means 32 bits).
+ *
+ * The number of slots to assign can optionally be limited by
+ * max_assigned_slots.
+ *
+ * Return how many 16-bit slots are left unused in the last vec4 (up to 8
+ * slots).
+ */
+static unsigned
+fs_assign_slots(struct linkage_info *linkage,
+                BITSET_WORD *assigned_mask,
+                uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
+                BITSET_WORD *input_mask,
+                enum fs_vec4_type fs_vec4_type,
+                unsigned slot_size,
+                unsigned max_assigned_slots,
+                bool assign_colors,
+                unsigned color_channel_rotate,
+                nir_opt_varyings_progress *progress)
+{
+   unsigned i, slot_index, max_slot;
+   unsigned num_assigned_slots = 0;
+
+   if (assign_colors) {
+      slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */
+      max_slot = VARYING_SLOT_COL1 + 1; /* exclusive, in vec4 units */
+   } else {
+      slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */
+      max_slot = VARYING_SLOT_MAX;
+   }
+
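+   /* Scalar slot indices are counted in 16-bit units, so one vec4 covers
+    * 8 of them. For example, VARYING_SLOT_COL0 * 8 is the low half of
+    * COL0.x and VARYING_SLOT_COL0 * 8 + 7 is the high half of COL0.w;
+    * the two color slots together span 16 scalar slots.
+    */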
+   /* Assign new slot indices for scalar slots. */
+   BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
+      if (is_interpolated_color(linkage, i) != assign_colors)
+         continue;
+
+      /* Skip indirectly-indexed scalar slots and slots incompatible
+       * with the FS vec4 type.
+       */
+      while ((fs_vec4_type != FS_VEC4_TYPE_NONE &&
+              assigned_fs_vec4_type[vec4_slot(slot_index)] !=
+                 FS_VEC4_TYPE_NONE &&
+              assigned_fs_vec4_type[vec4_slot(slot_index)] !=
+                 fs_vec4_type) ||
+             BITSET_TEST32(linkage->indirect_mask, slot_index) ||
+             BITSET_TEST(assigned_mask, slot_index)) {
+         /* If the FS vec4 type is incompatible, move to the next vec4. */
+         if (fs_vec4_type != FS_VEC4_TYPE_NONE &&
+             assigned_fs_vec4_type[vec4_slot(slot_index)] !=
+                FS_VEC4_TYPE_NONE &&
+             assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) {
+            slot_index = align(slot_index + slot_size, 8); /* next vec4 */
+            continue;
+         }
+
+         /* Copy the FS vec4 type if indexed indirectly, and move to
+          * the next slot.
+          */
+         if (BITSET_TEST32(linkage->indirect_mask, slot_index)) {
+            if (assigned_fs_vec4_type) {
+               assigned_fs_vec4_type[vec4_slot(slot_index)] =
+                  linkage->fs_vec4_type[vec4_slot(slot_index)];
+            }
+            assert(slot_index % 2 == 0);
+            slot_index += 2; /* increment by 32 bits */
+            continue;
+         }
+
+         /* This slot is already assigned (assigned_mask is set). Move to
+          * the next one.
+          */
+         slot_index += slot_size;
+      }
+
+      /* Assign color channels starting at the color_channel_rotate
+       * component. Cases:
+       *    color_channel_rotate = 0: xyzw
+       *    color_channel_rotate = 1: yzwx
+       *    color_channel_rotate = 2: zwxy
+       *    color_channel_rotate = 3: wxyz
+       *
+       * This has no effect on behavior per se, but some drivers merge VARn
+       * and COLn into one output if each defines different components.
+       * For example, if we store VAR0.xy and COL0.z, a driver can merge
+       * them by mapping the same output to 2 different inputs (VAR0 and
+       * COL0) if color-specific behavior is per component, but it can't
+       * merge VAR0.xy and COL0.x because they both define x.
+       */
+      unsigned new_slot_index = slot_index;
+      if (assign_colors && color_channel_rotate) {
+         new_slot_index = vec4_slot(new_slot_index) * 8 +
+                          (new_slot_index + color_channel_rotate * 2) % 8;
+      }
+
+      /* Relocate the slot. */
+      assert(slot_index < max_slot * 8);
+      relocate_slot(linkage, &linkage->slot[i], i, new_slot_index,
+                    fs_vec4_type, progress);
+      BITSET_SET(assigned_mask, slot_index);
+      if (assigned_fs_vec4_type)
+         assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type;
+      slot_index += slot_size; /* move to the next slot */
+      num_assigned_slots += slot_size;
+
+      /* Remove the slot from the input (unassigned) mask. */
+      BITSET_CLEAR(input_mask, i);
+
+      /* The number of slots to assign can optionally be limited. */
+      assert(num_assigned_slots <= max_assigned_slots);
+      if (num_assigned_slots == max_assigned_slots)
+         break;
+   }
+
+   assert(slot_index <= max_slot * 8);
+   /* Return how many 16-bit slots are left unused in the last vec4. */
+   return (NUM_SCALAR_SLOTS - slot_index) % 8;
+}
+
+/**
+ * This is called once for 32-bit inputs and once for 16-bit inputs.
+ * It assigns new slot indices to all scalar slots specified in the masks.
+ *
+ * \param linkage               Linkage info
+ * \param assigned_mask         Which scalar (16-bit) slots are already taken.
+ * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier
+ *                              and can only be filled with compatible slots.
+ * \param interp_mask           The list of interp slots to assign locations for.
+ * \param flat_mask             The list of flat slots to assign locations for.
+ * \param convergent_mask       The list of slots that have convergent output
+ *                              stores.
+ * \param color_interp_mask     The list of interpolated color slots to assign
+ *                              locations for, or NULL.
+ * \param sized_interp_type     One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}.
+ * \param slot_size             1 for 16 bits, 2 for 32 bits
+ * \param assign_colors         Whether to assign only color varyings or only
+ *                              non-color varyings.
+ * \param color_channel_rotate  Assign color channels starting with this index,
+ *                              e.g. 2 assigns channels in the zwxy order.
+ */
+static void
+fs_assign_slot_groups(struct linkage_info *linkage,
+                      BITSET_WORD *assigned_mask,
+                      uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
+                      BITSET_WORD *interp_mask,
+                      BITSET_WORD *flat_mask,
+                      BITSET_WORD *convergent_mask,
+                      BITSET_WORD *color_interp_mask,
+                      enum fs_vec4_type sized_interp_type,
+                      unsigned slot_size,
+                      bool assign_colors,
+                      unsigned color_channel_rotate,
+                      nir_opt_varyings_progress *progress)
+{
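+   /* A worked example of the flow below (hypothetical counts): if packing
+    * the interpolated slots ends one 32-bit component short of a vec4
+    * boundary, fs_assign_slots() returns 2 (two 16-bit slots), and one
+    * convergent slot of the same size is packed into that vec4 before any
+    * new flat vec4 is started for the remaining convergent inputs.
+    */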
+   /* Put interpolated slots first. */
+   unsigned unused_interp_slots =
+      fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                      interp_mask, sized_interp_type,
+                      slot_size, NUM_SCALAR_SLOTS, assign_colors,
+                      color_channel_rotate, progress);
+
+   unsigned unused_color_interp_slots = 0;
+   if (color_interp_mask) {
+      unused_color_interp_slots =
+         fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                         color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR,
+                         slot_size, NUM_SCALAR_SLOTS, assign_colors,
+                         color_channel_rotate, progress);
+   }
+
+   /* Put flat slots next.
+    * Note that only flat vec4 slots can have both 32-bit and 16-bit types
+    * packed in the same vec4. 32-bit flat inputs are packed first, followed
+    * by 16-bit flat inputs.
+    */
+   unsigned unused_flat_slots =
+      fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                      flat_mask, FS_VEC4_TYPE_FLAT,
+                      slot_size, NUM_SCALAR_SLOTS, assign_colors,
+                      color_channel_rotate, progress);
+
+   /* Take the inputs with convergent values and assign them as follows.
+    * Since they can be assigned as both interpolated and flat, we can
+    * choose. We prefer them to be flat, but if interpolated vec4s have
+    * unused components, try to fill those before starting a new flat vec4.
+    *
+    * First, fill the unused components of flat (if any), then fill
+    * the unused components of interpolated (if any), and then make
+    * the remaining convergent inputs flat.
+    */
+   if (unused_flat_slots) {
+      fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                      convergent_mask, FS_VEC4_TYPE_FLAT,
+                      slot_size, unused_flat_slots, assign_colors,
+                      color_channel_rotate, progress);
+   }
+   if (unused_interp_slots) {
+      fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                      convergent_mask, sized_interp_type,
+                      slot_size, unused_interp_slots, assign_colors,
+                      color_channel_rotate, progress);
+   }
+   if (unused_color_interp_slots) {
+      fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                      convergent_mask, FS_VEC4_TYPE_INTERP_COLOR,
+                      slot_size, unused_color_interp_slots, assign_colors,
+                      color_channel_rotate, progress);
+   }
+   fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
+                   convergent_mask, FS_VEC4_TYPE_FLAT,
+                   slot_size, NUM_SCALAR_SLOTS, assign_colors,
+                   color_channel_rotate, progress);
+}
+
+static void
+vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage,
+                           BITSET_WORD *input_mask,
+                           unsigned *slot_index,
+                           unsigned *patch_slot_index,
+                           unsigned slot_size,
+                           nir_opt_varyings_progress *progress)
+{
+   unsigned i;
+
+   BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
+      if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) {
+         /* Skip indirectly-indexed scalar slots at 32-bit granularity.
+          * We have to do it at this granularity because the low 16-bit
+          * slot is set to 1 for 32-bit inputs but not the high 16-bit slot.
+          */
+         while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index))
+            *patch_slot_index = align(*patch_slot_index + 1, 2);
+
+         assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8);
+         relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index,
+                       FS_VEC4_TYPE_NONE, progress);
+         *patch_slot_index += slot_size; /* increment by 16 or 32 bits */
+      } else {
+         /* If the driver wants to use POS and we've already used it, move
+          * to VARn.
+          */
+         if (*slot_index < VARYING_SLOT_VAR0 * 8 &&
+             *slot_index >= (VARYING_SLOT_POS + 1) * 8)
+            *slot_index = VARYING_SLOT_VAR0 * 8;
+
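+         /* For example (illustrative): with use_pos, the first 8 scalar
+          * slots fill POS.xyzw; the next input then jumps straight to
+          * VAR0 rather than spilling into the fixed-function slots
+          * between POS and VAR0.
+          */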
+         /* Skip indirectly-indexed scalar slots at 32-bit granularity. */
+         while (BITSET_TEST32(linkage->indirect_mask, *slot_index))
+            *slot_index = align(*slot_index + 1, 2);
+
+         assert(*slot_index < VARYING_SLOT_MAX * 8);
+         relocate_slot(linkage, &linkage->slot[i], i, *slot_index,
+                       FS_VEC4_TYPE_NONE, progress);
+         *slot_index += slot_size; /* increment by 16 or 32 bits */
+      }
+   }
+}
+
+/**
+ * Compaction means scalarizing and then packing scalar components into full
+ * vec4s, so that we minimize the number of unused components in vec4 slots.
+ *
+ * Compaction is as simple as moving a scalar input from one scalar slot
+ * to another. Indirectly-indexed slots are not touched, so compaction has
+ * to work around them. Unused 32-bit components of indirectly-indexed
+ * slots are still filled, so no space is wasted there, but if indirectly-
+ * indexed 16-bit components have the other 16-bit half unused, that half
+ * is wasted.
+ */
+static void
+compact_varyings(struct linkage_info *linkage,
+                 nir_opt_varyings_progress *progress)
+{
+   if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
+      /* These arrays are used to track which scalar slots we've already
+       * assigned. We can fill unused components of indirectly-indexed
+       * slots, but only if the vec4 slot type (FLAT, FP16, or FP32) is
+       * the same. Assign vec4 slot types separately, skipping over
+       * already-assigned scalar slots.
+       */
+      uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0};
+      BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS);
+      BITSET_ZERO(assigned_mask);
+
+      fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
+                            linkage->interp_fp32_mask, linkage->flat32_mask,
+                            linkage->convergent32_mask, NULL,
+                            FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress);
+
+      /* Now do the same thing, but for 16-bit inputs. */
+      fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
+                            linkage->interp_fp16_mask, linkage->flat16_mask,
+                            linkage->convergent16_mask, NULL,
+                            FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress);
+
+      /* Put transform-feedback-only outputs last. */
+      fs_assign_slots(linkage, assigned_mask, NULL,
+                      linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
+                      NUM_SCALAR_SLOTS, false, 0, progress);
+
+      fs_assign_slots(linkage, assigned_mask, NULL,
+                      linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1,
+                      NUM_SCALAR_SLOTS, false, 0, progress);
+
+      /* Color varyings are only compacted among themselves. First
+       * determine whether the shader contains any color varyings.
+       */
+      unsigned col0 = VARYING_SLOT_COL0 * 8;
+      bool has_colors =
+         !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0, 16,
+                                        0) ||
+         !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0, 16,
+                                        0) ||
+         !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0, 16, 0) ||
+         !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0, 16, 0) ||
+         !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0, 16, 0);
+
+      if (has_colors) {
+         unsigned color_channel_rotate =
+            DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4;
+
+         fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
+                               linkage->interp_fp32_mask, linkage->flat32_mask,
+                               linkage->convergent32_mask, linkage->color32_mask,
+                               FS_VEC4_TYPE_INTERP_FP32, 2, true,
+                               color_channel_rotate, progress);
+
+         /* Put transform-feedback-only outputs last. */
+         fs_assign_slots(linkage, assigned_mask, NULL,
+                         linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
+                         NUM_SCALAR_SLOTS, true, color_channel_rotate,
+                         progress);
+      }
+   } else {
+      /* The consumer is a TCS, TES, or GS.
+       *
+       * "use_pos" says whether the driver prefers that compaction with
+       * non-FS consumers puts varyings into POS first before using any
+       * VARn.
+       */
+      bool use_pos = !(linkage->producer_builder.shader->options->io_options &
+                       nir_io_dont_use_pos_for_non_fs_varyings);
+      unsigned slot_index = (use_pos ? VARYING_SLOT_POS
+                                     : VARYING_SLOT_VAR0) * 8;
+      unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8;
+
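+      /* Layout illustration (hypothetical): three 32-bit inputs occupy
+       * scalar slots 0..5 of the first vec4 (POS.xyz when use_pos is
+       * set); a following 16-bit input then lands in slot 6, the low
+       * half of .w, instead of starting a new vec4.
+       */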
+      /* Compact 32-bit inputs. */
+      vs_tcs_tes_gs_assign_slots(linkage, linkage->flat32_mask, &slot_index,
+                                 &patch_slot_index, 2, progress);
+
+      /* Compact 16-bit inputs, allowing them to share vec4 slots with
+       * 32-bit inputs.
+       */
+      vs_tcs_tes_gs_assign_slots(linkage, linkage->flat16_mask, &slot_index,
+                                 &patch_slot_index, 1, progress);
+
+      /* Put no-varying slots last. These are TCS outputs read by TCS but
+       * not TES.
+       */
+      vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying32_mask,
+                                 &slot_index, &patch_slot_index, 2, progress);
+      vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying16_mask,
+                                 &slot_index, &patch_slot_index, 1, progress);
+
+      assert(slot_index <= VARYING_SLOT_MAX * 8);
+      assert(patch_slot_index <= VARYING_SLOT_TESS_MAX * 8);
+   }
+}
+
+/******************************************************************
+ * PUTTING IT ALL TOGETHER
+ ******************************************************************/
+
+static void
+init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv,
+             unsigned max_uniform_components, unsigned max_ubos_per_stage,
+             struct linkage_info *linkage)
+{
+   *linkage = (struct linkage_info){
+      .spirv = spirv,
+      .producer_stage = producer->info.stage,
+      .consumer_stage = consumer->info.stage,
+      .producer_builder =
+         nir_builder_create(nir_shader_get_entrypoint(producer)),
+      .consumer_builder =
+         nir_builder_create(nir_shader_get_entrypoint(consumer)),
+
+      .max_varying_expression_cost =
+         producer->options->varying_expression_max_cost ?
+            producer->options->varying_expression_max_cost(producer, consumer) : 0,
+
+      .linear_mem_ctx = linear_context(ralloc_context(NULL)),
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) {
+      list_inithead(&linkage->slot[i].producer.loads);
+      list_inithead(&linkage->slot[i].producer.stores);
+      list_inithead(&linkage->slot[i].consumer.loads);
+   }
+
+   /* Preparation. */
+   nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage);
+   nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage);
+   tidy_up_indirect_varyings(linkage);
+   determine_uniform_movability(linkage, max_uniform_components);
+   determine_ubo_movability(linkage, max_ubos_per_stage);
+}
+
+static void
+free_linkage(struct linkage_info *linkage)
+{
+   ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx));
+}
+
+static void
+print_shader_linkage(nir_shader *producer, nir_shader *consumer)
+{
+   struct linkage_info linkage;
+
+   init_linkage(producer, consumer, false, 0, 0, &linkage);
+   print_linkage(&linkage);
+   free_linkage(&linkage);
+}
+
+/**
+ * Run lots of optimizations on varyings. See the description at the
+ * beginning of this file.
+ */
+nir_opt_varyings_progress
+nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
+                 unsigned max_uniform_components, unsigned max_ubos_per_stage)
+{
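+   /* Illustrative driver-side usage (hypothetical helper name and example
+    * budgets; not prescriptive): run this pass per adjacent shader pair
+    * and re-optimize whichever side changed:
+    *
+    *    nir_opt_varyings_progress p =
+    *       nir_opt_varyings(vs, fs, false, 16, 4);
+    *    if (p & nir_progress_producer)
+    *       driver_optimize_nir(vs);
+    *    if (p & nir_progress_consumer)
+    *       driver_optimize_nir(fs);
+    */
+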
+   /* Producers before a fragment shader must have up-to-date vertex
+    * divergence information.
+    */
+   if (consumer->info.stage == MESA_SHADER_FRAGMENT) {
+      /* Required by the divergence analysis. */
+      NIR_PASS(_, producer, nir_convert_to_lcssa, true, true);
+      nir_vertex_divergence_analysis(producer);
+   }
+
+   nir_opt_varyings_progress progress = 0;
+   struct linkage_info linkage;
+   init_linkage(producer, consumer, spirv, max_uniform_components,
+                max_ubos_per_stage, &linkage);
+
+   /* Part 1: Run optimizations that only remove varyings. (They can still
+    * move instructions between shaders.)
+    */
+   remove_dead_varyings(&linkage, &progress);
+   propagate_uniform_expressions(&linkage, &progress);
+
+   /* Part 2: Deduplicate outputs. */
+   deduplicate_outputs(&linkage, &progress);
+
+   /* Run CSE on the consumer after output deduplication because duplicated
+    * loads can prevent finding the post-dominator for inter-shader code
+    * motion.
+    */
+   NIR_PASS(_, consumer, nir_opt_cse);
+
+   /* Re-gather linkage info after CSE. */
+   free_linkage(&linkage);
+   init_linkage(producer, consumer, spirv, max_uniform_components,
+                max_ubos_per_stage, &linkage);
+   /* This must be done again to clean up bitmasks in linkage. */
+   remove_dead_varyings(&linkage, &progress);
+
+   /* This must be done after deduplication and before inter-shader code
+    * motion.
+    */
+   tidy_up_convergent_varyings(&linkage);
+   find_open_coded_tes_input_interpolation(&linkage);
+
+   /* Part 3: Run optimizations that completely change varyings. */
+#if PRINT
+   int i = 0;
+   puts("Before:");
+   nir_print_shader(linkage.producer_builder.shader, stdout);
+   nir_print_shader(linkage.consumer_builder.shader, stdout);
+   print_linkage(&linkage);
+   puts("");
+#endif
+
+   while (backward_inter_shader_code_motion(&linkage, &progress)) {
+#if PRINT
+      i++;
+      printf("Finished: %i\n", i);
+      nir_print_shader(linkage.producer_builder.shader, stdout);
+      nir_print_shader(linkage.consumer_builder.shader, stdout);
+      print_linkage(&linkage);
+      puts("");
+#endif
+   }
+
+   /* Part 4: Do compaction. */
+   compact_varyings(&linkage, &progress);
+
+   nir_metadata_preserve(linkage.producer_builder.impl,
+                         progress & nir_progress_producer ?
+                            (nir_metadata_block_index |
+                             nir_metadata_dominance) :
+                            nir_metadata_all);
+   nir_metadata_preserve(linkage.consumer_builder.impl,
+                         progress & nir_progress_consumer ?
+                            (nir_metadata_block_index |
+                             nir_metadata_dominance) :
+                            nir_metadata_all);
+   free_linkage(&linkage);
+
+   if (progress & nir_progress_producer)
+      nir_validate_shader(producer, "nir_opt_varyings");
+   if (progress & nir_progress_consumer)
+      nir_validate_shader(consumer, "nir_opt_varyings");
+
+   return progress;
+}