intel/fs,vec4: Drop uniform compaction and pull constant support

The only driver using these was i965 and it's gone now.  This is all
dead code.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14056>
Author:     Jason Ekstrand
Date:       2021-12-03 21:34:06 -06:00
Committed:  Marge Bot
Parent:     4175ed5099
Commit:     8f3c100d61
16 changed files with 20 additions and 791 deletions


@@ -181,9 +181,6 @@ crocus_disk_cache_retrieve(struct crocus_context *ice,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
       blob_copy_bytes(&blob, prog_data->param,


@@ -224,10 +224,8 @@ crocus_upload_shader(struct crocus_context *ice,
    shader->bt = *bt;
    ralloc_steal(shader, shader->prog_data);
-   if (prog_data_size > 16) {
+   if (prog_data_size > 16)
       ralloc_steal(shader->prog_data, prog_data->param);
-      ralloc_steal(shader->prog_data, prog_data->pull_param);
-   }
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);


@@ -779,9 +779,7 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = crocus_shader_debug_log;
    screen->compiler->shader_perf_log = crocus_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = false;
-   screen->compiler->compact_params = false;
    screen->compiler->constant_buffer_0_is_relative = true;
    if (screen->devinfo.ver >= 7) {


@@ -207,9 +207,6 @@ iris_disk_cache_retrieve(struct iris_screen *screen,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
       blob_copy_bytes(&blob, prog_data->param,


@@ -90,7 +90,6 @@ iris_finalize_program(struct iris_compiled_shader *shader,
    ralloc_steal(shader, shader->prog_data);
    ralloc_steal(shader->prog_data, (void *)prog_data->relocs);
    ralloc_steal(shader->prog_data, prog_data->param);
-   ralloc_steal(shader->prog_data, prog_data->pull_param);
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);
 }


@@ -839,9 +839,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = iris_shader_debug_log;
    screen->compiler->shader_perf_log = iris_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = true;
-   screen->compiler->compact_params = false;
    screen->compiler->indirect_ubos_use_sampler = screen->devinfo.ver < 12;
 
    screen->l3_config_3d = iris_get_default_l3_config(&screen->devinfo, false);


@@ -91,24 +91,12 @@ struct brw_compiler {
     */
    bool constant_buffer_0_is_relative;
 
-   /**
-    * Whether or not the driver supports pull constants. If not, the compiler
-    * will attempt to push everything.
-    */
-   bool supports_pull_constants;
-
    /**
     * Whether or not the driver supports NIR shader constants. This controls
     * whether nir_opt_large_constants will be run.
     */
    bool supports_shader_constants;
 
-   /**
-    * Whether or not the driver wants uniform params to be compacted by the
-    * back-end compiler.
-    */
-   bool compact_params;
-
    /**
     * Whether or not the driver wants variable group size to be lowered by the
     * back-end compiler.
@@ -775,7 +763,6 @@ struct brw_stage_prog_data {
    struct brw_ubo_range ubo_ranges[4];
 
    GLuint nr_params; /**< number of float params/constants */
-   GLuint nr_pull_params;
 
    gl_shader_stage stage;
@@ -822,7 +809,6 @@ struct brw_stage_prog_data {
     * above.
     */
    uint32_t *param;
-   uint32_t *pull_param;
 
    /* Whether shader uses atomic operations. */
    bool uses_atomic_load_store;
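For reference, a driver now only sets the knobs that survive in struct brw_compiler; the pull-constant and compaction flags removed above simply no longer exist. Below is a minimal sketch of the driver-side setup, modeled on the iris and crocus hunks in this commit; the log callbacks and the flag values are illustrative placeholders, not taken from any particular driver.

   /* Hypothetical driver initialization mirroring the pattern in the hunks below. */
   struct brw_compiler *compiler = brw_compiler_create(screen, &screen->devinfo);
   compiler->shader_debug_log = my_shader_debug_log;   /* driver-provided callback */
   compiler->shader_perf_log = my_shader_perf_log;     /* driver-provided callback */
   compiler->supports_shader_constants = true;         /* allow nir_opt_large_constants */
   compiler->constant_buffer_0_is_relative = true;
   /* compiler->supports_pull_constants and compiler->compact_params are gone. */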


@@ -1234,7 +1234,6 @@ void
 fs_visitor::import_uniforms(fs_visitor *v)
 {
    this->push_constant_loc = v->push_constant_loc;
-   this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
    for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
@@ -1801,7 +1800,6 @@ fs_visitor::assign_curb_setup()
    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
    if (want_zero) {
-      assert(!compiler->compact_params);
       fs_builder ubld = bld.exec_all().group(8, 0).at(
          cfg->first_block(), cfg->first_block()->start());
@@ -2396,109 +2394,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo,
    return -1;
 }
/**
* Struct for handling complex alignments.
*
* A complex alignment is stored as multiplier and an offset. A value is
* considered to be aligned if it is {offset} larger than a multiple of {mul}.
* For instance, with an alignment of {8, 2}, cplx_align_apply would do the
* following:
*
*  N  | cplx_align_apply({8, 2}, N)
* ----+-----------------------------
*  4  | 6
*  6  | 6
*  8  | 14
*  10 | 14
*  12 | 14
*  14 | 14
*  16 | 22
*/
struct cplx_align {
unsigned mul:4;
unsigned offset:4;
};
#define CPLX_ALIGN_MAX_MUL 8
static void
cplx_align_assert_sane(struct cplx_align a)
{
assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
assert(a.offset < a.mul);
}
/**
* Combines two alignments to produce a least multiple of sorts.
*
* The returned alignment is the smallest (in terms of multiplier) such that
* anything aligned to both a and b will be aligned to the new alignment.
* This function will assert-fail if a and b are not compatible, i.e. if the
* offset parameters are such that no common alignment is possible.
*/
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
cplx_align_assert_sane(a);
cplx_align_assert_sane(b);
/* Assert that the alignments agree. */
assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
return a.mul > b.mul ? a : b;
}
/**
* Apply a complex alignment
*
* This function will return the smallest number greater than or equal to
* offset that is aligned to align.
*/
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
return ALIGN(offset - align.offset, align.mul) + align.offset;
}
#define UNIFORM_SLOT_SIZE 4
struct uniform_slot_info {
/** True if the given uniform slot is live */
unsigned is_live:1;
/** True if this slot and the next slot must remain contiguous */
unsigned contiguous:1;
struct cplx_align align;
};
static void
mark_uniform_slots_read(struct uniform_slot_info *slots,
unsigned num_slots, unsigned alignment)
{
assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
assert(alignment <= CPLX_ALIGN_MAX_MUL);
/* We can't align a slot to anything less than the slot size */
alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
struct cplx_align align = {alignment, 0};
cplx_align_assert_sane(align);
for (unsigned i = 0; i < num_slots; i++) {
slots[i].is_live = true;
if (i < num_slots - 1)
slots[i].contiguous = true;
align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
if (slots[i].align.mul == 0) {
slots[i].align = align;
} else {
slots[i].align = cplx_align_combine(slots[i].align, align);
}
}
}
 /**
  * Assign UNIFORM file registers to either push constants or pull constants.
  *
@@ -2512,197 +2407,12 @@ void
 fs_visitor::assign_constant_locations()
 {
    /* Only the first compile gets to decide on locations. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
-   if (compiler->compact_params) {
-      struct uniform_slot_info slots[uniforms + 1];
-      memset(slots, 0, sizeof(slots));
+   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   for (unsigned u = 0; u < uniforms; u++)
+      push_constant_loc[u] = u;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
for (int i = 0 ; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)
continue;
/* NIR tightly packs things so the uniform number might not be
* aligned (if we have a double right after a float, for
* instance). This is fine because the process of re-arranging
* them will ensure that things are properly aligned. The offset
* into that uniform, however, must be aligned.
*
* In Vulkan, we have explicit offsets but everything is crammed
* into a single "variable" so inst->src[i].nr will always be 0.
* Everything will be properly aligned relative to that one base.
*/
assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
unsigned u = inst->src[i].nr +
inst->src[i].offset / UNIFORM_SLOT_SIZE;
if (u >= uniforms)
continue;
unsigned slots_read;
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
} else {
unsigned bytes_read = inst->components_read(i) *
type_sz(inst->src[i].type);
slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
}
assert(u + slots_read <= uniforms);
mark_uniform_slots_read(&slots[u], slots_read,
type_sz(inst->src[i].type));
}
}
int subgroup_id_index = get_subgroup_id_param_index(devinfo,
stage_prog_data);
/* Only allow 16 registers (128 uniform components) as push constants.
*
* Just demote the end of the list. We could probably do better
* here, demoting things that are rarely used in the program first.
*
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
unsigned int max_push_components = 16 * 8;
if (subgroup_id_index >= 0)
max_push_components--; /* Save a slot for the thread ID */
/* We push small arrays, but no bigger than 16 floats. This is big
* enough for a vec4 but hopefully not large enough to push out other
* stuff. We should probably use a better heuristic at some point.
*/
const unsigned int max_chunk_size = 16;
unsigned int num_push_constants = 0;
unsigned int num_pull_constants = 0;
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
/* Default to -1 meaning no location */
memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
int chunk_start = -1;
struct cplx_align align;
for (unsigned u = 0; u < uniforms; u++) {
if (!slots[u].is_live) {
assert(chunk_start == -1);
continue;
}
/* Skip subgroup_id_index to put it in the last push register. */
if (subgroup_id_index == (int)u)
continue;
if (chunk_start == -1) {
chunk_start = u;
align = slots[u].align;
} else {
/* Offset into the chunk */
unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
/* Shift the slot alignment down by the chunk offset so it is
* comparable with the base chunk alignment.
*/
struct cplx_align slot_align = slots[u].align;
slot_align.offset =
(slot_align.offset - chunk_offset) & (align.mul - 1);
align = cplx_align_combine(align, slot_align);
}
/* Sanity check the alignment */
cplx_align_assert_sane(align);
if (slots[u].contiguous)
continue;
/* Adjust the alignment to be in terms of slots, not bytes */
assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
align.mul /= UNIFORM_SLOT_SIZE;
align.offset /= UNIFORM_SLOT_SIZE;
unsigned push_start_align = cplx_align_apply(align, num_push_constants);
unsigned chunk_size = u - chunk_start + 1;
if ((!compiler->supports_pull_constants && u < UBO_START) ||
(chunk_size < max_chunk_size &&
push_start_align + chunk_size <= max_push_components)) {
/* Align up the number of push constants */
num_push_constants = push_start_align;
for (unsigned i = 0; i < chunk_size; i++)
push_constant_loc[chunk_start + i] = num_push_constants++;
} else {
/* We need to pull this one */
num_pull_constants = cplx_align_apply(align, num_pull_constants);
for (unsigned i = 0; i < chunk_size; i++)
pull_constant_loc[chunk_start + i] = num_pull_constants++;
}
/* Reset the chunk and start again */
chunk_start = -1;
}
/* Add the CS local thread ID uniform at the end of the push constants */
if (subgroup_id_index >= 0)
push_constant_loc[subgroup_id_index] = num_push_constants++;
/* As the uniforms are going to be reordered, stash the old array and
* create two new arrays for push/pull params.
*/
uint32_t *param = stage_prog_data->param;
stage_prog_data->nr_params = num_push_constants;
if (num_push_constants) {
stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
num_push_constants);
} else {
stage_prog_data->param = NULL;
}
assert(stage_prog_data->nr_pull_params == 0);
assert(stage_prog_data->pull_param == NULL);
if (num_pull_constants > 0) {
stage_prog_data->nr_pull_params = num_pull_constants;
stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
num_pull_constants);
}
/* Up until now, the param[] array has been indexed by reg + offset
* of UNIFORM registers. Move pull constants into pull_param[] and
* condense param[] to only contain the uniforms we chose to push.
*
* NOTE: Because we are condensing the params[] array, we know that
* push_constant_loc[i] <= i and we can do it in one smooth loop without
* having to make a copy.
*/
for (unsigned int i = 0; i < uniforms; i++) {
uint32_t value = param[i];
if (pull_constant_loc[i] != -1) {
stage_prog_data->pull_param[pull_constant_loc[i]] = value;
} else if (push_constant_loc[i] != -1) {
stage_prog_data->param[push_constant_loc[i]] = value;
}
}
ralloc_free(param);
} else {
/* If we don't want to compact anything, just set up dummy push/pull
* arrays. All the rest of the compiler cares about are these arrays.
*/
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
for (unsigned u = 0; u < uniforms; u++)
push_constant_loc[u] = u;
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
}
    /* Now that we know how many regular uniforms we'll push, reduce the
     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
@@ -2733,33 +2443,22 @@ fs_visitor::get_pull_locs(const fs_reg &src,
 {
    assert(src.file == UNIFORM);
 
-   if (src.nr >= UBO_START) {
-      const struct brw_ubo_range *range =
-         &prog_data->ubo_ranges[src.nr - UBO_START];
+   if (src.nr < UBO_START)
+      return false;
 
-      /* If this access is in our (reduced) range, use the push data. */
-      if (src.offset / 32 < range->length)
-         return false;
+   const struct brw_ubo_range *range =
+      &prog_data->ubo_ranges[src.nr - UBO_START];
 
-      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
-      *out_pull_index = (32 * range->start + src.offset) / 4;
+   /* If this access is in our (reduced) range, use the push data. */
+   if (src.offset / 32 < range->length)
+      return false;
 
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
+   *out_surf_index = prog_data->binding_table.ubo_start + range->block;
+   *out_pull_index = (32 * range->start + src.offset) / 4;
 
-   const unsigned location = src.nr + src.offset / 4;
-   if (location < uniforms && pull_constant_loc[location] != -1) {
-      /* A regular uniform push constant */
-      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
-      *out_pull_index = pull_constant_loc[location];
-
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
-
-   return false;
+   prog_data->has_ubo_pull = true;
+   return true;
 }
 
 /**
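To make the surviving pull path concrete, here is a small standalone sketch of the index math in the new get_pull_locs(); the struct, the binding-table value, and the offsets below are illustrative stand-ins, not the real prog_data types. An access at byte offset 48 into a UBO range that only pushes one 32-byte chunk falls past the pushed length, so it is pulled, and the pull index comes back in dwords.

   #include <stdbool.h>
   #include <stdio.h>

   /* Illustrative stand-in for brw_ubo_range: start/length are in 32-byte units. */
   struct ubo_range_sketch { unsigned block, start, length; };

   /* Mirrors the post-commit get_pull_locs() arithmetic for the UBO case. */
   static bool
   get_pull_locs_sketch(const struct ubo_range_sketch *range, unsigned byte_offset,
                        unsigned ubo_start_binding, unsigned *surf, unsigned *pull)
   {
      /* Accesses inside the (reduced) pushed range keep using push data. */
      if (byte_offset / 32 < range->length)
         return false;

      *surf = ubo_start_binding + range->block;
      *pull = (32 * range->start + byte_offset) / 4;   /* dword index */
      return true;
   }

   int main(void)
   {
      struct ubo_range_sketch r = { .block = 0, .start = 2, .length = 1 };
      unsigned surf, pull;
      if (get_pull_locs_sketch(&r, 48, 16, &surf, &pull))
         printf("pull from surface %u, dword %u\n", surf, pull);  /* surface 16, dword 28 */
      return 0;
   }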


@@ -369,12 +369,6 @@ public:
    /** Byte-offset for the next available spot in the scratch space buffer. */
    unsigned last_scratch;
 
-   /**
-    * Array mapping UNIFORM register numbers to the pull parameter index,
-    * or -1 if this uniform register isn't being uploaded as a pull constant.
-    */
-   int *pull_constant_loc;
-
    /**
     * Array mapping UNIFORM register numbers to the push parameter index,
     * or -1 if this uniform register isn't being uploaded as a push constant.


@@ -103,10 +103,8 @@ void
 fs_visitor::nir_setup_uniforms()
 {
    /* Only the first compile gets to set up uniforms. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
    uniforms = nir->num_uniforms / 4;


@@ -126,7 +126,6 @@ fs_visitor::emit_dummy_fs()
    /* We don't have any uniforms. */
    stage_prog_data->nr_params = 0;
-   stage_prog_data->nr_pull_params = 0;
    stage_prog_data->curb_read_length = 0;
    stage_prog_data->dispatch_grf_start_reg = 2;
    wm_prog_data->dispatch_grf_start_reg_16 = 2;
@@ -1192,7 +1191,6 @@ fs_visitor::init()
    this->uniforms = 0;
    this->last_scratch = 0;
 
-   this->pull_constant_loc = NULL;
    this->push_constant_loc = NULL;
 
    this->shader_stats.scheduler_mode = NULL;


@@ -604,194 +604,6 @@ vec4_visitor::split_uniform_registers()
    }
 }
/* This function returns the register number where we placed the uniform */
static int
set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
const int src, const int size, const int channel_size,
int *new_loc, int *new_chan,
int *new_chans_used)
{
int dst;
/* Find the lowest place we can slot this uniform in. */
for (dst = 0; dst < nr_uniforms; dst++) {
if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
break;
}
assert(dst < nr_uniforms);
new_loc[src] = dst;
new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;
*new_uniform_count = MAX2(*new_uniform_count, dst + 1);
return dst;
}
void
vec4_visitor::pack_uniform_registers()
{
if (!compiler->compact_params)
return;
uint8_t chans_used[this->uniforms];
int new_loc[this->uniforms];
int new_chan[this->uniforms];
bool is_aligned_to_dvec4[this->uniforms];
int new_chans_used[this->uniforms];
int channel_sizes[this->uniforms];
memset(chans_used, 0, sizeof(chans_used));
memset(new_loc, 0, sizeof(new_loc));
memset(new_chan, 0, sizeof(new_chan));
memset(new_chans_used, 0, sizeof(new_chans_used));
memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
memset(channel_sizes, 0, sizeof(channel_sizes));
/* Find which uniform vectors are actually used by the program. We
* expect unused vector elements when we've moved array access out
* to pull constants, and from some GLSL code generators like wine.
*/
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
unsigned readmask;
switch (inst->opcode) {
case VEC4_OPCODE_PACK_BYTES:
case BRW_OPCODE_DP4:
case BRW_OPCODE_DPH:
readmask = 0xf;
break;
case BRW_OPCODE_DP3:
readmask = 0x7;
break;
case BRW_OPCODE_DP2:
readmask = 0x3;
break;
default:
readmask = inst->dst.writemask;
break;
}
for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue;
assert(type_sz(inst->src[i].type) % 4 == 0);
int channel_size = type_sz(inst->src[i].type) / 4;
int reg = inst->src[i].nr;
for (int c = 0; c < 4; c++) {
if (!(readmask & (1 << c)))
continue;
unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
unsigned used = MAX2(chans_used[reg], channel * channel_size);
if (used <= 4) {
chans_used[reg] = used;
channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size);
} else {
is_aligned_to_dvec4[reg] = true;
is_aligned_to_dvec4[reg + 1] = true;
chans_used[reg + 1] = used - 4;
channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size);
}
}
}
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
inst->src[0].file == UNIFORM) {
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
assert(inst->src[0].subnr == 0);
unsigned bytes_read = inst->src[2].ud;
assert(bytes_read % 4 == 0);
unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
/* We just mark every register touched by a MOV_INDIRECT as being
* fully used. This ensures that it doesn't broken up piecewise by
* the next part of our packing algorithm.
*/
int reg = inst->src[0].nr;
int channel_size = type_sz(inst->src[0].type) / 4;
for (unsigned i = 0; i < vec4s_read; i++) {
chans_used[reg + i] = 4;
channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size);
}
}
}
int new_uniform_count = 0;
/* As the uniforms are going to be reordered, take the data from a temporary
* copy of the original param[].
*/
uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params);
memcpy(param, stage_prog_data->param,
sizeof(uint32_t) * stage_prog_data->nr_params);
/* Now, figure out a packing of the live uniform vectors into our
* push constants. Start with dvec{3,4} because they are aligned to
* dvec4 size (2 vec4).
*/
for (int src = 0; src < uniforms; src++) {
int size = chans_used[src];
if (size == 0 || !is_aligned_to_dvec4[src])
continue;
/* dvec3 are aligned to dvec4 size, apply the alignment of the size
* to 4 to avoid moving last component of a dvec3 to the available
* location at the end of a previous dvec3. These available locations
* could be filled by smaller variables in next loop.
*/
size = ALIGN(size, 4);
int dst = set_push_constant_loc(uniforms, &new_uniform_count,
src, size, channel_sizes[src],
new_loc, new_chan,
new_chans_used);
/* Move the references to the data */
for (int j = 0; j < size; j++) {
stage_prog_data->param[dst * 4 + new_chan[src] + j] =
param[src * 4 + j];
}
}
/* Continue with the rest of data, which is aligned to vec4. */
for (int src = 0; src < uniforms; src++) {
int size = chans_used[src];
if (size == 0 || is_aligned_to_dvec4[src])
continue;
int dst = set_push_constant_loc(uniforms, &new_uniform_count,
src, size, channel_sizes[src],
new_loc, new_chan,
new_chans_used);
/* Move the references to the data */
for (int j = 0; j < size; j++) {
stage_prog_data->param[dst * 4 + new_chan[src] + j] =
param[src * 4 + j];
}
}
ralloc_free(param);
this->uniforms = new_uniform_count;
stage_prog_data->nr_params = new_uniform_count * 4;
/* Now, update the instructions for our repacked uniforms. */
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) {
int src = inst->src[i].nr;
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue;
int chan = new_chan[src] / channel_sizes[src];
inst->src[i].nr = new_loc[src];
inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan);
}
}
}
 /**
  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
  *
@@ -910,97 +722,6 @@ vec4_visitor::opt_algebraic()
    return progress;
 }
/**
* Only a limited number of hardware registers may be used for push
* constants, so this turns access to the overflowed constants into
* pull constants.
*/
void
vec4_visitor::move_push_constants_to_pull_constants()
{
int pull_constant_loc[this->uniforms];
const int max_uniform_components = push_length * 8;
if (this->uniforms * 4 <= max_uniform_components)
return;
assert(compiler->supports_pull_constants);
assert(compiler->compact_params);
/* If we got here, we also can't have any push ranges */
for (unsigned i = 0; i < 4; i++)
assert(prog_data->base.ubo_ranges[i].length == 0);
/* Make some sort of choice as to which uniforms get sent to pull
* constants. We could potentially do something clever here like
* look for the most infrequently used uniform vec4s, but leave
* that for later.
*/
for (int i = 0; i < this->uniforms * 4; i += 4) {
pull_constant_loc[i / 4] = -1;
if (i >= max_uniform_components) {
uint32_t *values = &stage_prog_data->param[i];
/* Try to find an existing copy of this uniform in the pull
* constants if it was part of an array access already.
*/
for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
int matches;
for (matches = 0; matches < 4; matches++) {
if (stage_prog_data->pull_param[j + matches] != values[matches])
break;
}
if (matches == 4) {
pull_constant_loc[i / 4] = j / 4;
break;
}
}
if (pull_constant_loc[i / 4] == -1) {
assert(stage_prog_data->nr_pull_params % 4 == 0);
pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
for (int j = 0; j < 4; j++) {
stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
values[j];
}
}
}
}
/* Now actually rewrite usage of the things we've moved to pull
* constants.
*/
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START ||
pull_constant_loc[inst->src[i].nr] == -1)
continue;
int uniform = inst->src[i].nr;
const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
glsl_type::dvec4_type : glsl_type::vec4_type;
dst_reg temp = dst_reg(this, temp_type);
emit_pull_constant_load(block, inst, temp, inst->src[i],
pull_constant_loc[uniform], src_reg());
inst->src[i].file = temp.file;
inst->src[i].nr = temp.nr;
inst->src[i].offset %= 16;
inst->src[i].reladdr = NULL;
}
}
/* Repack push constants to remove the now-unused ones. */
pack_uniform_registers();
}
 /* Conditions for which we want to avoid setting the dependency control bits */
 bool
 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
@@ -1842,15 +1563,13 @@ vec4_visitor::setup_uniforms(int reg)
    /* It's possible that uniform compaction will shrink further than expected
     * so we re-compute the layout and set up our UBO push starts.
     */
-   const unsigned old_push_length = push_length;
+   ASSERTED const unsigned old_push_length = push_length;
    push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
    for (unsigned i = 0; i < 4; i++) {
       ubo_push_start[i] = push_length;
       push_length += stage_prog_data->ubo_ranges[i].length;
    }
-   assert(push_length <= old_push_length);
-   if (push_length < old_push_length)
-      assert(compiler->compact_params);
+   assert(push_length == old_push_length);
 
    /* The pre-gfx6 VS requires that some push constants get loaded no
     * matter what, or the GPU would hang.
@@ -2738,10 +2457,8 @@ vec4_visitor::run()
     * often do repeated subexpressions for those.
     */
    move_grf_array_access_to_scratch();
-   move_uniform_array_access_to_pull_constants();
-   pack_uniform_registers();
-   move_push_constants_to_pull_constants();
+   split_uniform_registers();
    split_virtual_grfs();
 
 #define OPT(pass, args...) ({ \
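As a quick sanity check on the setup_uniforms() change above: with compaction gone, the register count derived from nr_params can no longer shrink between passes, which is why the assertion tightens from <= to ==. A rough worked example of the layout arithmetic follows, with invented counts (20 uniform dwords and one pushed UBO range of 2 registers); only the 8-dwords-per-push-register division is taken from the code above.

   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      /* Invented numbers: 20 uniform dwords plus one pushed UBO range of 2 registers. */
      unsigned nr_params = 20;
      unsigned ubo_range_len[4] = { 2, 0, 0, 0 };
      unsigned ubo_push_start[4];

      /* One push register holds 8 dwords, as in setup_uniforms(). */
      unsigned push_length = DIV_ROUND_UP(nr_params, 8);      /* 3 registers of uniforms */

      for (unsigned i = 0; i < 4; i++) {
         ubo_push_start[i] = push_length;
         push_length += ubo_range_len[i];
      }

      printf("UBO push starts at reg %u, total push length %u\n",
             ubo_push_start[0], push_length);                 /* 3 and 5 */
      return 0;
   }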


@@ -138,9 +138,7 @@ public:
    void spill_reg(unsigned spill_reg);
    void move_grf_array_access_to_scratch();
    void move_uniform_array_access_to_pull_constants();
-   void move_push_constants_to_pull_constants();
    void split_uniform_registers();
-   void pack_uniform_registers();
    void setup_push_ranges();
    virtual void invalidate_analysis(brw::analysis_dependency_class c);
    void split_virtual_grfs();
@@ -292,11 +290,6 @@ public:
                             int base_offset);
    void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                            int base_offset);
-   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
-                                dst_reg dst,
-                                src_reg orig_src,
-                                int base_offset,
-                                src_reg indirect);
    void emit_pull_constant_load_reg(dst_reg dst,
                                     src_reg surf_index,
                                     src_reg offset,


@@ -889,7 +889,6 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
       memcpy(prog_data->base.base.param, param,
              sizeof(uint32_t) * param_count);
       prog_data->base.base.nr_params = param_count;
-      prog_data->base.base.nr_pull_params = 0;
       ralloc_free(param);
    }
} }

View File

@@ -1592,146 +1592,6 @@ vec4_visitor::move_grf_array_access_to_scratch()
    }
 }
/**
* Emits an instruction before @inst to load the value named by @orig_src
* from the pull constant buffer (surface) at @base_offset to @temp.
*/
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg temp, src_reg orig_src,
int base_offset, src_reg indirect)
{
assert(orig_src.offset % 16 == 0);
const unsigned index = prog_data->base.binding_table.pull_constants_start;
/* For 64bit loads we need to emit two 32-bit load messages and we also
* we need to shuffle the 32-bit data result into proper 64-bit data. To do
* that we emit the 32-bit loads into a temporary and we shuffle the result
* into the original destination.
*/
dst_reg orig_temp = temp;
bool is_64bit = type_sz(orig_src.type) == 8;
if (is_64bit) {
assert(type_sz(temp.type) == 8);
dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
temp = retype(temp_df, BRW_REGISTER_TYPE_F);
}
src_reg src = orig_src;
for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
int reg_offset = base_offset + src.offset / 16;
src_reg offset;
if (indirect.file != BAD_FILE) {
offset = src_reg(this, glsl_type::uint_type);
emit_before(block, inst, ADD(dst_reg(offset), indirect,
brw_imm_ud(reg_offset * 16)));
} else {
offset = brw_imm_d(reg_offset * 16);
}
emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
brw_imm_ud(index),
offset,
block, inst);
src = byte_offset(src, 16);
}
if (is_64bit) {
temp = retype(temp, BRW_REGISTER_TYPE_DF);
shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
}
}
/**
* Implements array access of uniforms by inserting a
* PULL_CONSTANT_LOAD instruction.
*
* Unlike temporary GRF array access (where we don't support it due to
* the difficulty of doing relative addressing on instruction
* destinations), we could potentially do array access of uniforms
* that were loaded in GRF space as push constants. In real-world
* usage we've seen, though, the arrays being used are always larger
* than we could load as push constants, so just always move all
* uniform array access out to a pull constant buffer.
*/
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
/* The vulkan dirver doesn't support pull constants other than UBOs so
* everything has to be pushed regardless.
*/
if (!compiler->supports_pull_constants) {
split_uniform_registers();
return;
}
/* Allocate the pull_params array */
assert(stage_prog_data->nr_pull_params == 0);
stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
this->uniforms * 4);
int pull_constant_loc[this->uniforms];
memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
/* First, walk through the instructions and determine which things need to
* be pulled. We mark something as needing to be pulled by setting
* pull_constant_loc to 0.
*/
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
/* We only care about MOV_INDIRECT of a uniform */
if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
inst->src[0].file != UNIFORM)
continue;
int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
pull_constant_loc[uniform_nr + j] = 0;
}
/* Next, we walk the list of uniforms and assign real pull constant
* locations and set their corresponding entries in pull_param.
*/
for (int j = 0; j < this->uniforms; j++) {
if (pull_constant_loc[j] < 0)
continue;
pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
for (int i = 0; i < 4; i++) {
stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
= stage_prog_data->param[j * 4 + i];
}
}
/* Finally, we can walk through the instructions and lower MOV_INDIRECT
* instructions to actual uniform pulls.
*/
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
/* We only care about MOV_INDIRECT of a uniform */
if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
inst->src[0].file != UNIFORM)
continue;
int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
pull_constant_loc[uniform_nr], inst->src[1]);
inst->remove(block);
}
/* Now there are no accesses of the UNIFORM file with a reladdr, so
* no need to track them as larger-than-vec4 objects. This will be
* relied on in cutting out unused uniform vectors from push
* constants.
*/
split_uniform_registers();
}
 void
 vec4_visitor::resolve_ud_negate(src_reg *reg)
 {


@@ -974,11 +974,9 @@ anv_physical_device_try_create(struct anv_instance *instance,
    }
 
    device->compiler->shader_debug_log = compiler_debug_log;
    device->compiler->shader_perf_log = compiler_perf_log;
-   device->compiler->supports_pull_constants = false;
    device->compiler->constant_buffer_0_is_relative =
       device->info.ver < 8 || !device->has_context_isolation;
    device->compiler->supports_shader_constants = true;
-   device->compiler->compact_params = false;
    device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
 
    isl_device_init(&device->isl_dev, &device->info);