intel/fs,vec4: Drop uniform compaction and pull constant support

The only driver using these was i965 and it's gone now.  This is all
dead code.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14056>
Author:     Jason Ekstrand
Date:       2021-12-03 21:34:06 -06:00
Committed:  Marge Bot
Parent:     4175ed5099
Commit:     8f3c100d61
16 changed files with 20 additions and 791 deletions


@@ -181,9 +181,6 @@ crocus_disk_cache_retrieve(struct crocus_context *ice,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
       blob_copy_bytes(&blob, prog_data->param,


@@ -224,10 +224,8 @@ crocus_upload_shader(struct crocus_context *ice,
    shader->bt = *bt;
    ralloc_steal(shader, shader->prog_data);
-   if (prog_data_size > 16) {
+   if (prog_data_size > 16)
       ralloc_steal(shader->prog_data, prog_data->param);
-      ralloc_steal(shader->prog_data, prog_data->pull_param);
-   }
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);


@@ -779,9 +779,7 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = crocus_shader_debug_log;
    screen->compiler->shader_perf_log = crocus_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = false;
-   screen->compiler->compact_params = false;
    screen->compiler->constant_buffer_0_is_relative = true;
    if (screen->devinfo.ver >= 7) {


@@ -207,9 +207,6 @@ iris_disk_cache_retrieve(struct iris_screen *screen,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
       blob_copy_bytes(&blob, prog_data->param,


@@ -90,7 +90,6 @@ iris_finalize_program(struct iris_compiled_shader *shader,
    ralloc_steal(shader, shader->prog_data);
    ralloc_steal(shader->prog_data, (void *)prog_data->relocs);
    ralloc_steal(shader->prog_data, prog_data->param);
-   ralloc_steal(shader->prog_data, prog_data->pull_param);
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);
 }


@@ -839,9 +839,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = iris_shader_debug_log;
    screen->compiler->shader_perf_log = iris_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = true;
-   screen->compiler->compact_params = false;
    screen->compiler->indirect_ubos_use_sampler = screen->devinfo.ver < 12;
 
    screen->l3_config_3d = iris_get_default_l3_config(&screen->devinfo, false);


@@ -91,24 +91,12 @@ struct brw_compiler {
     */
    bool constant_buffer_0_is_relative;
 
-   /**
-    * Whether or not the driver supports pull constants. If not, the compiler
-    * will attempt to push everything.
-    */
-   bool supports_pull_constants;
-
    /**
     * Whether or not the driver supports NIR shader constants. This controls
     * whether nir_opt_large_constants will be run.
     */
    bool supports_shader_constants;
 
-   /**
-    * Whether or not the driver wants uniform params to be compacted by the
-    * back-end compiler.
-    */
-   bool compact_params;
-
    /**
     * Whether or not the driver wants variable group size to be lowered by the
     * back-end compiler.
@@ -775,7 +763,6 @@ struct brw_stage_prog_data {
    struct brw_ubo_range ubo_ranges[4];
 
    GLuint nr_params; /**< number of float params/constants */
-   GLuint nr_pull_params;
 
    gl_shader_stage stage;
@@ -822,7 +809,6 @@ struct brw_stage_prog_data {
     * above.
     */
    uint32_t *param;
-   uint32_t *pull_param;
 
    /* Whether shader uses atomic operations. */
    bool uses_atomic_load_store;
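For reference, a driver now only sets the knobs that survive in struct brw_compiler; the pull-constant and compaction flags removed above simply no longer exist. Below is a minimal sketch of the driver-side setup, modeled on the iris and crocus hunks in this commit; the log callbacks and the flag values are illustrative placeholders, not taken from any particular driver.

   /* Hypothetical driver initialization mirroring the pattern in the hunks below. */
   struct brw_compiler *compiler = brw_compiler_create(screen, &screen->devinfo);
   compiler->shader_debug_log = my_shader_debug_log;   /* driver-provided callback */
   compiler->shader_perf_log = my_shader_perf_log;     /* driver-provided callback */
   compiler->supports_shader_constants = true;         /* allow nir_opt_large_constants */
   compiler->constant_buffer_0_is_relative = true;
   /* compiler->supports_pull_constants and compiler->compact_params are gone. */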


@@ -1234,7 +1234,6 @@ void
 fs_visitor::import_uniforms(fs_visitor *v)
 {
    this->push_constant_loc = v->push_constant_loc;
-   this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
    for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
@@ -1801,7 +1800,6 @@ fs_visitor::assign_curb_setup()
    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
    if (want_zero) {
-      assert(!compiler->compact_params);
       fs_builder ubld = bld.exec_all().group(8, 0).at(
          cfg->first_block(), cfg->first_block()->start());
@@ -2396,109 +2394,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo,
    return -1;
 }
/**
* Struct for handling complex alignments.
*
* A complex alignment is stored as multiplier and an offset. A value is
* considered to be aligned if it is {offset} larger than a multiple of {mul}.
* For instance, with an alignment of {8, 2}, cplx_align_apply would do the
* following:
*
*  N  | cplx_align_apply({8, 2}, N)
* ----+-----------------------------
*  4  | 6
*  6  | 6
*  8  | 14
*  10 | 14
*  12 | 14
*  14 | 14
*  16 | 22
*/
struct cplx_align {
unsigned mul:4;
unsigned offset:4;
};
#define CPLX_ALIGN_MAX_MUL 8
static void
cplx_align_assert_sane(struct cplx_align a)
{
assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
assert(a.offset < a.mul);
}
/**
* Combines two alignments to produce a least multiple of sorts.
*
* The returned alignment is the smallest (in terms of multiplier) such that
* anything aligned to both a and b will be aligned to the new alignment.
* This function will assert-fail if a and b are not compatible, i.e. if the
* offset parameters are such that no common alignment is possible.
*/
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
cplx_align_assert_sane(a);
cplx_align_assert_sane(b);
/* Assert that the alignments agree. */
assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
return a.mul > b.mul ? a : b;
}
/**
* Apply a complex alignment
*
* This function will return the smallest number greater than or equal to
* offset that is aligned to align.
*/
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
return ALIGN(offset - align.offset, align.mul) + align.offset;
}
#define UNIFORM_SLOT_SIZE 4
struct uniform_slot_info {
/** True if the given uniform slot is live */
unsigned is_live:1;
/** True if this slot and the next slot must remain contiguous */
unsigned contiguous:1;
struct cplx_align align;
};
static void
mark_uniform_slots_read(struct uniform_slot_info *slots,
unsigned num_slots, unsigned alignment)
{
assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
assert(alignment <= CPLX_ALIGN_MAX_MUL);
/* We can't align a slot to anything less than the slot size */
alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
struct cplx_align align = {alignment, 0};
cplx_align_assert_sane(align);
for (unsigned i = 0; i < num_slots; i++) {
slots[i].is_live = true;
if (i < num_slots - 1)
slots[i].contiguous = true;
align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
if (slots[i].align.mul == 0) {
slots[i].align = align;
} else {
slots[i].align = cplx_align_combine(slots[i].align, align);
}
}
}
 /**
  * Assign UNIFORM file registers to either push constants or pull constants.
  *
@@ -2512,197 +2407,12 @@ void
 fs_visitor::assign_constant_locations()
 {
    /* Only the first compile gets to decide on locations. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
-   if (compiler->compact_params) {
-      struct uniform_slot_info slots[uniforms + 1];
-      memset(slots, 0, sizeof(slots));
+   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   for (unsigned u = 0; u < uniforms; u++)
+      push_constant_loc[u] = u;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
for (int i = 0 ; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)
continue;
/* NIR tightly packs things so the uniform number might not be
* aligned (if we have a double right after a float, for
* instance). This is fine because the process of re-arranging
* them will ensure that things are properly aligned. The offset
* into that uniform, however, must be aligned.
*
* In Vulkan, we have explicit offsets but everything is crammed
* into a single "variable" so inst->src[i].nr will always be 0.
* Everything will be properly aligned relative to that one base.
*/
assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
unsigned u = inst->src[i].nr +
inst->src[i].offset / UNIFORM_SLOT_SIZE;
if (u >= uniforms)
continue;
unsigned slots_read;
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
} else {
unsigned bytes_read = inst->components_read(i) *
type_sz(inst->src[i].type);
slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
}
assert(u + slots_read <= uniforms);
mark_uniform_slots_read(&slots[u], slots_read,
type_sz(inst->src[i].type));
}
}
int subgroup_id_index = get_subgroup_id_param_index(devinfo,
stage_prog_data);
/* Only allow 16 registers (128 uniform components) as push constants.
*
* Just demote the end of the list. We could probably do better
* here, demoting things that are rarely used in the program first.
*
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
unsigned int max_push_components = 16 * 8;
if (subgroup_id_index >= 0)
max_push_components--; /* Save a slot for the thread ID */
/* We push small arrays, but no bigger than 16 floats. This is big
* enough for a vec4 but hopefully not large enough to push out other
* stuff. We should probably use a better heuristic at some point.
*/
const unsigned int max_chunk_size = 16;
unsigned int num_push_constants = 0;
unsigned int num_pull_constants = 0;
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
/* Default to -1 meaning no location */
memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
int chunk_start = -1;
struct cplx_align align;
for (unsigned u = 0; u < uniforms; u++) {
if (!slots[u].is_live) {
assert(chunk_start == -1);
continue;
}
/* Skip subgroup_id_index to put it in the last push register. */
if (subgroup_id_index == (int)u)
continue;
if (chunk_start == -1) {
chunk_start = u;
align = slots[u].align;
} else {
/* Offset into the chunk */
unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
/* Shift the slot alignment down by the chunk offset so it is
* comparable with the base chunk alignment.
*/
struct cplx_align slot_align = slots[u].align;
slot_align.offset =
(slot_align.offset - chunk_offset) & (align.mul - 1);
align = cplx_align_combine(align, slot_align);
}
/* Sanity check the alignment */
cplx_align_assert_sane(align);
if (slots[u].contiguous)
continue;
/* Adjust the alignment to be in terms of slots, not bytes */
assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
align.mul /= UNIFORM_SLOT_SIZE;
align.offset /= UNIFORM_SLOT_SIZE;
unsigned push_start_align = cplx_align_apply(align, num_push_constants);
unsigned chunk_size = u - chunk_start + 1;
if ((!compiler->supports_pull_constants && u < UBO_START) ||
(chunk_size < max_chunk_size &&
push_start_align + chunk_size <= max_push_components)) {
/* Align up the number of push constants */
num_push_constants = push_start_align;
for (unsigned i = 0; i < chunk_size; i++)
push_constant_loc[chunk_start + i] = num_push_constants++;
} else {
/* We need to pull this one */
num_pull_constants = cplx_align_apply(align, num_pull_constants);
for (unsigned i = 0; i < chunk_size; i++)
pull_constant_loc[chunk_start + i] = num_pull_constants++;
}
/* Reset the chunk and start again */
chunk_start = -1;
}
/* Add the CS local thread ID uniform at the end of the push constants */
if (subgroup_id_index >= 0)
push_constant_loc[subgroup_id_index] = num_push_constants++;
/* As the uniforms are going to be reordered, stash the old array and
* create two new arrays for push/pull params.
*/
uint32_t *param = stage_prog_data->param;
stage_prog_data->nr_params = num_push_constants;
if (num_push_constants) {
stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
num_push_constants);
} else {
stage_prog_data->param = NULL;
}
assert(stage_prog_data->nr_pull_params == 0);
assert(stage_prog_data->pull_param == NULL);
if (num_pull_constants > 0) {
stage_prog_data->nr_pull_params = num_pull_constants;
stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
num_pull_constants);
}
/* Up until now, the param[] array has been indexed by reg + offset
* of UNIFORM registers. Move pull constants into pull_param[] and
* condense param[] to only contain the uniforms we chose to push.
*
* NOTE: Because we are condensing the params[] array, we know that
* push_constant_loc[i] <= i and we can do it in one smooth loop without
* having to make a copy.
*/
for (unsigned int i = 0; i < uniforms; i++) {
uint32_t value = param[i];
if (pull_constant_loc[i] != -1) {
stage_prog_data->pull_param[pull_constant_loc[i]] = value;
} else if (push_constant_loc[i] != -1) {
stage_prog_data->param[push_constant_loc[i]] = value;
}
}
ralloc_free(param);
} else {
/* If we don't want to compact anything, just set up dummy push/pull
* arrays. All the rest of the compiler cares about are these arrays.
*/
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
for (unsigned u = 0; u < uniforms; u++)
push_constant_loc[u] = u;
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
}
    /* Now that we know how many regular uniforms we'll push, reduce the
     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
@@ -2733,33 +2443,22 @@ fs_visitor::get_pull_locs(const fs_reg &src,
 {
    assert(src.file == UNIFORM);
 
-   if (src.nr >= UBO_START) {
-      const struct brw_ubo_range *range =
-         &prog_data->ubo_ranges[src.nr - UBO_START];
+   if (src.nr < UBO_START)
+      return false;
 
-      /* If this access is in our (reduced) range, use the push data. */
-      if (src.offset / 32 < range->length)
-         return false;
+   const struct brw_ubo_range *range =
+      &prog_data->ubo_ranges[src.nr - UBO_START];
 
-      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
-      *out_pull_index = (32 * range->start + src.offset) / 4;
+   /* If this access is in our (reduced) range, use the push data. */
+   if (src.offset / 32 < range->length)
+      return false;
 
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
+   *out_surf_index = prog_data->binding_table.ubo_start + range->block;
+   *out_pull_index = (32 * range->start + src.offset) / 4;
 
-   const unsigned location = src.nr + src.offset / 4;
-   if (location < uniforms && pull_constant_loc[location] != -1) {
-      /* A regular uniform push constant */
-      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
-      *out_pull_index = pull_constant_loc[location];
-
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
-
-   return false;
+   prog_data->has_ubo_pull = true;
+   return true;
 }
 
 /**
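To make the surviving pull path concrete, here is a small standalone sketch of the index math in the new get_pull_locs(); the struct, the binding-table value, and the offsets below are illustrative stand-ins, not the real prog_data types. An access at byte offset 48 into a UBO range that only pushes one 32-byte chunk falls past the pushed length, so it is pulled, and the pull index comes back in dwords.

   #include <stdbool.h>
   #include <stdio.h>

   /* Illustrative stand-in for brw_ubo_range: start/length are in 32-byte units. */
   struct ubo_range_sketch { unsigned block, start, length; };

   /* Mirrors the post-commit get_pull_locs() arithmetic for the UBO case. */
   static bool
   get_pull_locs_sketch(const struct ubo_range_sketch *range, unsigned byte_offset,
                        unsigned ubo_start_binding, unsigned *surf, unsigned *pull)
   {
      /* Accesses inside the (reduced) pushed range keep using push data. */
      if (byte_offset / 32 < range->length)
         return false;

      *surf = ubo_start_binding + range->block;
      *pull = (32 * range->start + byte_offset) / 4;   /* dword index */
      return true;
   }

   int main(void)
   {
      struct ubo_range_sketch r = { .block = 0, .start = 2, .length = 1 };
      unsigned surf, pull;
      if (get_pull_locs_sketch(&r, 48, 16, &surf, &pull))
         printf("pull from surface %u, dword %u\n", surf, pull);  /* surface 16, dword 28 */
      return 0;
   }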


@@ -369,12 +369,6 @@ public:
    /** Byte-offset for the next available spot in the scratch space buffer. */
    unsigned last_scratch;
 
-   /**
-    * Array mapping UNIFORM register numbers to the pull parameter index,
-    * or -1 if this uniform register isn't being uploaded as a pull constant.
-    */
-   int *pull_constant_loc;
-
    /**
     * Array mapping UNIFORM register numbers to the push parameter index,
     * or -1 if this uniform register isn't being uploaded as a push constant.


@@ -103,10 +103,8 @@ void
 fs_visitor::nir_setup_uniforms()
 {
    /* Only the first compile gets to set up uniforms. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
    uniforms = nir->num_uniforms / 4;


@@ -126,7 +126,6 @@ fs_visitor::emit_dummy_fs()
    /* We don't have any uniforms. */
    stage_prog_data->nr_params = 0;
-   stage_prog_data->nr_pull_params = 0;
    stage_prog_data->curb_read_length = 0;
    stage_prog_data->dispatch_grf_start_reg = 2;
    wm_prog_data->dispatch_grf_start_reg_16 = 2;
@@ -1192,7 +1191,6 @@ fs_visitor::init()
    this->uniforms = 0;
    this->last_scratch = 0;
 
-   this->pull_constant_loc = NULL;
    this->push_constant_loc = NULL;
 
    this->shader_stats.scheduler_mode = NULL;


@@ -604,194 +604,6 @@ vec4_visitor::split_uniform_registers()
    }
 }
/* This function returns the register number where we placed the uniform */
static int
set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
const int src, const int size, const int channel_size,
int *new_loc, int *new_chan,
int *new_chans_used)
{
int dst;
/* Find the lowest place we can slot this uniform in. */
for (dst = 0; dst < nr_uniforms; dst++) {
if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
break;
}
assert(dst < nr_uniforms);
new_loc[src] = dst;
new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;
*new_uniform_count = MAX2(*new_uniform_count, dst + 1);
return dst;
}
void
vec4_visitor::pack_uniform_registers()
{
if (!compiler->compact_params)
return;
uint8_t chans_used[this->uniforms];
int new_loc[this->uniforms];
int new_chan[this->uniforms];
bool is_aligned_to_dvec4[this->uniforms];
int new_chans_used[this->uniforms];
int channel_sizes[this->uniforms];
memset(chans_used, 0, sizeof(chans_used));
memset(new_loc, 0, sizeof(new_loc));
memset(new_chan, 0, sizeof(new_chan));
memset(new_chans_used, 0, sizeof(new_chans_used));
memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
memset(channel_sizes, 0, sizeof(channel_sizes));
/* Find which uniform vectors are actually used by the program. We
* expect unused vector elements when we've moved array access out
* to pull constants, and from some GLSL code generators like wine.
*/
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
unsigned readmask;
switch (inst->opcode) {
case VEC4_OPCODE_PACK_BYTES:
case BRW_OPCODE_DP4:
case BRW_OPCODE_DPH:
readmask = 0xf;
break;
case BRW_OPCODE_DP3:
readmask = 0x7;
break;
case BRW_OPCODE_DP2:
readmask = 0x3;
break;
default:
readmask = inst->dst.writemask;
break;
}
for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue;
assert(type_sz(inst->src[i].type) % 4 == 0);
int channel_size = type_sz(inst->src[i].type) / 4;
int reg = inst->src[i].nr;
for (int c = 0; c < 4; c++) {
if (!(readmask & (1 << c)))
continue;
unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
unsigned used = MAX2(chans_used[reg], channel * channel_size);
if (used <= 4) {
chans_used[reg] = used;
channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size);
} else {
is_aligned_to_dvec4[reg] = true;
is_aligned_to_dvec4[reg + 1] = true;
chans_used[reg + 1] = used - 4;
channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size);
}
}
}
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
inst->src[0].file == UNIFORM) {
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
assert(inst->src[0].subnr == 0);
unsigned bytes_read = inst->src[2].ud;
assert(bytes_read % 4 == 0);
unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
/* We just mark every register touched by a MOV_INDIRECT as being
* fully used. This ensures that it doesn't broken up piecewise by
* the next part of our packing algorithm.
*/
int reg = inst->src[0].nr;
int channel_size = type_sz(inst->src[0].type) / 4;
for (unsigned i = 0; i < vec4s_read; i++) {
chans_used[reg + i] = 4;
channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size);
}
}
}
int new_uniform_count = 0;
/* As the uniforms are going to be reordered, take the data from a temporary
* copy of the original param[].
*/
uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params);
memcpy(param, stage_prog_data->param,
sizeof(uint32_t) * stage_prog_data->nr_params);
/* Now, figure out a packing of the live uniform vectors into our
* push constants. Start with dvec{3,4} because they are aligned to
* dvec4 size (2 vec4).
*/
for (int src = 0; src < uniforms; src++) {
int size = chans_used[src];
if (size == 0 || !is_aligned_to_dvec4[src])
continue;
/* dvec3 are aligned to dvec4 size, apply the alignment of the size
* to 4 to avoid moving last component of a dvec3 to the available
* location at the end of a previous dvec3. These available locations
* could be filled by smaller variables in next loop.
*/
size = ALIGN(size, 4);
int dst = set_push_constant_loc(uniforms, &new_uniform_count,
src, size, channel_sizes[src],
new_loc, new_chan,
new_chans_used);
/* Move the references to the data */
for (int j = 0; j < size; j++) {
stage_prog_data->param[dst * 4 + new_chan[src] + j] =
param[src * 4 + j];
}
}
/* Continue with the rest of data, which is aligned to vec4. */
for (int src = 0; src < uniforms; src++) {
int size = chans_used[src];
if (size == 0 || is_aligned_to_dvec4[src])
continue;
int dst = set_push_constant_loc(uniforms, &new_uniform_count,
src, size, channel_sizes[src],
new_loc, new_chan,
new_chans_used);
/* Move the references to the data */
for (int j = 0; j < size; j++) {
stage_prog_data->param[dst * 4 + new_chan[src] + j] =
param[src * 4 + j];
}
}
ralloc_free(param);
this->uniforms = new_uniform_count;
stage_prog_data->nr_params = new_uniform_count * 4;
/* Now, update the instructions for our repacked uniforms. */
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) {
int src = inst->src[i].nr;
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue;
int chan = new_chan[src] / channel_sizes[src];
inst->src[i].nr = new_loc[src];
inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan);
}
}
}
 /**
  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
  *
@@ -910,97 +722,6 @@ vec4_visitor::opt_algebraic()
    return progress;
 }
/**
* Only a limited number of hardware registers may be used for push
* constants, so this turns access to the overflowed constants into
* pull constants.
*/
void
vec4_visitor::move_push_constants_to_pull_constants()
{
int pull_constant_loc[this->uniforms];
const int max_uniform_components = push_length * 8;
if (this->uniforms * 4 <= max_uniform_components)
return;
assert(compiler->supports_pull_constants);
assert(compiler->compact_params);
/* If we got here, we also can't have any push ranges */
for (unsigned i = 0; i < 4; i++)
assert(prog_data->base.ubo_ranges[i].length == 0);
/* Make some sort of choice as to which uniforms get sent to pull
* constants. We could potentially do something clever here like
* look for the most infrequently used uniform vec4s, but leave
* that for later.
*/
for (int i = 0; i < this->uniforms * 4; i += 4) {
pull_constant_loc[i / 4] = -1;
if (i >= max_uniform_components) {
uint32_t *values = &stage_prog_data->param[i];
/* Try to find an existing copy of this uniform in the pull
* constants if it was part of an array access already.
*/
for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
int matches;
for (matches = 0; matches < 4; matches++) {
if (stage_prog_data->pull_param[j + matches] != values[matches])
break;
}
if (matches == 4) {
pull_constant_loc[i / 4] = j / 4;
break;
}
}
if (pull_constant_loc[i / 4] == -1) {
assert(stage_prog_data->nr_pull_params % 4 == 0);
pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
for (int j = 0; j < 4; j++) {
stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
values[j];
}
}
}
}
/* Now actually rewrite usage of the things we've moved to pull
* constants.
*/
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START ||
pull_constant_loc[inst->src[i].nr] == -1)
continue;
int uniform = inst->src[i].nr;
const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
glsl_type::dvec4_type : glsl_type::vec4_type;
dst_reg temp = dst_reg(this, temp_type);
emit_pull_constant_load(block, inst, temp, inst->src[i],
pull_constant_loc[uniform], src_reg());
inst->src[i].file = temp.file;
inst->src[i].nr = temp.nr;
inst->src[i].offset %= 16;
inst->src[i].reladdr = NULL;
}
}
/* Repack push constants to remove the now-unused ones. */
pack_uniform_registers();
}
 /* Conditions for which we want to avoid setting the dependency control bits */
 bool
 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
@@ -1842,15 +1563,13 @@ vec4_visitor::setup_uniforms(int reg)
    /* It's possible that uniform compaction will shrink further than expected
     * so we re-compute the layout and set up our UBO push starts.
     */
-   const unsigned old_push_length = push_length;
+   ASSERTED const unsigned old_push_length = push_length;
    push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
    for (unsigned i = 0; i < 4; i++) {
       ubo_push_start[i] = push_length;
       push_length += stage_prog_data->ubo_ranges[i].length;
    }
-   assert(push_length <= old_push_length);
-   if (push_length < old_push_length)
-      assert(compiler->compact_params);
+   assert(push_length == old_push_length);
 
    /* The pre-gfx6 VS requires that some push constants get loaded no
     * matter what, or the GPU would hang.
@@ -2738,10 +2457,8 @@ vec4_visitor::run()
     * often do repeated subexpressions for those.
     */
    move_grf_array_access_to_scratch();
-   move_uniform_array_access_to_pull_constants();
-   pack_uniform_registers();
-   move_push_constants_to_pull_constants();
+   split_uniform_registers();
    split_virtual_grfs();
 
 #define OPT(pass, args...) ({ \
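As a quick sanity check on the setup_uniforms() change above: with compaction gone, the register count derived from nr_params can no longer shrink between passes, which is why the assertion tightens from <= to ==. A rough worked example of the layout arithmetic follows, with invented counts (20 uniform dwords and one pushed UBO range of 2 registers); only the 8-dwords-per-push-register division is taken from the code above.

   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      /* Invented numbers: 20 uniform dwords plus one pushed UBO range of 2 registers. */
      unsigned nr_params = 20;
      unsigned ubo_range_len[4] = { 2, 0, 0, 0 };
      unsigned ubo_push_start[4];

      /* One push register holds 8 dwords, as in setup_uniforms(). */
      unsigned push_length = DIV_ROUND_UP(nr_params, 8);      /* 3 registers of uniforms */

      for (unsigned i = 0; i < 4; i++) {
         ubo_push_start[i] = push_length;
         push_length += ubo_range_len[i];
      }

      printf("UBO push starts at reg %u, total push length %u\n",
             ubo_push_start[0], push_length);                 /* 3 and 5 */
      return 0;
   }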


@@ -138,9 +138,7 @@ public:
    void spill_reg(unsigned spill_reg);
    void move_grf_array_access_to_scratch();
    void move_uniform_array_access_to_pull_constants();
-   void move_push_constants_to_pull_constants();
    void split_uniform_registers();
-   void pack_uniform_registers();
    void setup_push_ranges();
    virtual void invalidate_analysis(brw::analysis_dependency_class c);
    void split_virtual_grfs();
@@ -292,11 +290,6 @@ public:
                             int base_offset);
    void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                            int base_offset);
-   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
-                                dst_reg dst,
-                                src_reg orig_src,
-                                int base_offset,
-                                src_reg indirect);
    void emit_pull_constant_load_reg(dst_reg dst,
                                     src_reg surf_index,
                                     src_reg offset,


@@ -889,7 +889,6 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
       memcpy(prog_data->base.base.param, param,
              sizeof(uint32_t) * param_count);
       prog_data->base.base.nr_params = param_count;
-      prog_data->base.base.nr_pull_params = 0;
       ralloc_free(param);
    }
} }

View File

@@ -1592,146 +1592,6 @@ vec4_visitor::move_grf_array_access_to_scratch()
    }
 }
/**
* Emits an instruction before @inst to load the value named by @orig_src
* from the pull constant buffer (surface) at @base_offset to @temp.
*/
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg temp, src_reg orig_src,
int base_offset, src_reg indirect)
{
assert(orig_src.offset % 16 == 0);
const unsigned index = prog_data->base.binding_table.pull_constants_start;
/* For 64bit loads we need to emit two 32-bit load messages and we also
* we need to shuffle the 32-bit data result into proper 64-bit data. To do
* that we emit the 32-bit loads into a temporary and we shuffle the result
* into the original destination.
*/
dst_reg orig_temp = temp;
bool is_64bit = type_sz(orig_src.type) == 8;
if (is_64bit) {
assert(type_sz(temp.type) == 8);
dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
temp = retype(temp_df, BRW_REGISTER_TYPE_F);
}
src_reg src = orig_src;
for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
int reg_offset = base_offset + src.offset / 16;
src_reg offset;
if (indirect.file != BAD_FILE) {
offset = src_reg(this, glsl_type::uint_type);
emit_before(block, inst, ADD(dst_reg(offset), indirect,
brw_imm_ud(reg_offset * 16)));
} else {
offset = brw_imm_d(reg_offset * 16);
}
emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
brw_imm_ud(index),
offset,
block, inst);
src = byte_offset(src, 16);
}
if (is_64bit) {
temp = retype(temp, BRW_REGISTER_TYPE_DF);
shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
}
}
/**
* Implements array access of uniforms by inserting a
* PULL_CONSTANT_LOAD instruction.
*
* Unlike temporary GRF array access (where we don't support it due to
* the difficulty of doing relative addressing on instruction
* destinations), we could potentially do array access of uniforms
* that were loaded in GRF space as push constants. In real-world
* usage we've seen, though, the arrays being used are always larger
* than we could load as push constants, so just always move all
* uniform array access out to a pull constant buffer.
*/
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
/* The vulkan dirver doesn't support pull constants other than UBOs so
* everything has to be pushed regardless.
*/
if (!compiler->supports_pull_constants) {
split_uniform_registers();
return;
}
/* Allocate the pull_params array */
assert(stage_prog_data->nr_pull_params == 0);
stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
this->uniforms * 4);
int pull_constant_loc[this->uniforms];
memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
/* First, walk through the instructions and determine which things need to
* be pulled. We mark something as needing to be pulled by setting
* pull_constant_loc to 0.
*/
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
/* We only care about MOV_INDIRECT of a uniform */
if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
inst->src[0].file != UNIFORM)
continue;
int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
pull_constant_loc[uniform_nr + j] = 0;
}
/* Next, we walk the list of uniforms and assign real pull constant
* locations and set their corresponding entries in pull_param.
*/
for (int j = 0; j < this->uniforms; j++) {
if (pull_constant_loc[j] < 0)
continue;
pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
for (int i = 0; i < 4; i++) {
stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
= stage_prog_data->param[j * 4 + i];
}
}
/* Finally, we can walk through the instructions and lower MOV_INDIRECT
* instructions to actual uniform pulls.
*/
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
/* We only care about MOV_INDIRECT of a uniform */
if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
inst->src[0].file != UNIFORM)
continue;
int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
pull_constant_loc[uniform_nr], inst->src[1]);
inst->remove(block);
}
/* Now there are no accesses of the UNIFORM file with a reladdr, so
* no need to track them as larger-than-vec4 objects. This will be
* relied on in cutting out unused uniform vectors from push
* constants.
*/
split_uniform_registers();
}
 void
 vec4_visitor::resolve_ud_negate(src_reg *reg)
 {


@@ -974,11 +974,9 @@ anv_physical_device_try_create(struct anv_instance *instance,
    }
 
    device->compiler->shader_debug_log = compiler_debug_log;
    device->compiler->shader_perf_log = compiler_perf_log;
-   device->compiler->supports_pull_constants = false;
    device->compiler->constant_buffer_0_is_relative =
       device->info.ver < 8 || !device->has_context_isolation;
    device->compiler->supports_shader_constants = true;
-   device->compiler->compact_params = false;
    device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
 
    isl_device_init(&device->isl_dev, &device->info);