intel/fs,vec4: Drop uniform compaction and pull constant support

The only driver using these was i965 and it's gone now. This is all
dead code.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14056>

committed by Marge Bot
parent 4175ed5099
commit 8f3c100d61
@@ -181,9 +181,6 @@ crocus_disk_cache_retrieve(struct crocus_context *ice,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
       blob_copy_bytes(&blob, prog_data->param,
@@ -224,10 +224,8 @@ crocus_upload_shader(struct crocus_context *ice,
    shader->bt = *bt;
 
    ralloc_steal(shader, shader->prog_data);
-   if (prog_data_size > 16) {
+   if (prog_data_size > 16)
       ralloc_steal(shader->prog_data, prog_data->param);
-      ralloc_steal(shader->prog_data, prog_data->pull_param);
-   }
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);
 
@@ -779,9 +779,7 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = crocus_shader_debug_log;
    screen->compiler->shader_perf_log = crocus_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = false;
-   screen->compiler->compact_params = false;
    screen->compiler->constant_buffer_0_is_relative = true;
 
    if (screen->devinfo.ver >= 7) {
@@ -207,9 +207,6 @@ iris_disk_cache_retrieve(struct iris_screen *screen,
    }
 
    prog_data->param = NULL;
-   prog_data->pull_param = NULL;
-   assert(prog_data->nr_pull_params == 0);
-
    if (prog_data->nr_params) {
       prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
      blob_copy_bytes(&blob, prog_data->param,
@@ -90,7 +90,6 @@ iris_finalize_program(struct iris_compiled_shader *shader,
    ralloc_steal(shader, shader->prog_data);
    ralloc_steal(shader->prog_data, (void *)prog_data->relocs);
    ralloc_steal(shader->prog_data, prog_data->param);
-   ralloc_steal(shader->prog_data, prog_data->pull_param);
    ralloc_steal(shader, shader->streamout);
    ralloc_steal(shader, shader->system_values);
 }
@@ -839,9 +839,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler = brw_compiler_create(screen, &screen->devinfo);
    screen->compiler->shader_debug_log = iris_shader_debug_log;
    screen->compiler->shader_perf_log = iris_shader_perf_log;
-   screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = true;
-   screen->compiler->compact_params = false;
    screen->compiler->indirect_ubos_use_sampler = screen->devinfo.ver < 12;
 
    screen->l3_config_3d = iris_get_default_l3_config(&screen->devinfo, false);
@@ -91,24 +91,12 @@ struct brw_compiler {
    */
   bool constant_buffer_0_is_relative;
 
-  /**
-   * Whether or not the driver supports pull constants. If not, the compiler
-   * will attempt to push everything.
-   */
-  bool supports_pull_constants;
-
   /**
    * Whether or not the driver supports NIR shader constants. This controls
    * whether nir_opt_large_constants will be run.
    */
   bool supports_shader_constants;
 
-  /**
-   * Whether or not the driver wants uniform params to be compacted by the
-   * back-end compiler.
-   */
-  bool compact_params;
-
   /**
    * Whether or not the driver wants variable group size to be lowered by the
    * back-end compiler.
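
Editor's note: the two dropped flags were dead switches at this point. A minimal standalone sketch (hypothetical struct and names, not a real Mesa call site) of the pattern this hunk removes:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the two fields this commit deletes from struct brw_compiler. */
struct compiler_flags {
   bool supports_pull_constants;
   bool compact_params;
};

int
main(void)
{
   struct compiler_flags c;
   /* crocus, iris, and anv all initialized both flags to false (see the
    * screen/device-create hunks in this commit), so guards such as
    * "if (compiler->compact_params) { ... }" could never be taken. */
   c.supports_pull_constants = false;
   c.compact_params = false;
   printf("%d %d\n", c.supports_pull_constants, c.compact_params);
   return 0;
}
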
@@ -775,7 +763,6 @@ struct brw_stage_prog_data {
    struct brw_ubo_range ubo_ranges[4];
 
    GLuint nr_params; /**< number of float params/constants */
-   GLuint nr_pull_params;
 
    gl_shader_stage stage;
 
@@ -822,7 +809,6 @@ struct brw_stage_prog_data {
    * above.
    */
   uint32_t *param;
-  uint32_t *pull_param;
 
   /* Whether shader uses atomic operations. */
   bool uses_atomic_load_store;
@@ -1234,7 +1234,6 @@ void
 fs_visitor::import_uniforms(fs_visitor *v)
 {
    this->push_constant_loc = v->push_constant_loc;
-   this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
    for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
@@ -1801,7 +1800,6 @@ fs_visitor::assign_curb_setup()
 
    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
    if (want_zero) {
-      assert(!compiler->compact_params);
       fs_builder ubld = bld.exec_all().group(8, 0).at(
          cfg->first_block(), cfg->first_block()->start());
 
@@ -2396,109 +2394,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo,
    return -1;
 }
 
-/**
- * Struct for handling complex alignments.
- *
- * A complex alignment is stored as multiplier and an offset.  A value is
- * considered to be aligned if it is {offset} larger than a multiple of {mul}.
- * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
- * following:
- *
- *    N  | cplx_align_apply({8, 2}, N)
- *   ----+-----------------------------
- *    4  | 6
- *    6  | 6
- *    8  | 14
- *   10  | 14
- *   12  | 14
- *   14  | 14
- *   16  | 22
- */
-struct cplx_align {
-   unsigned mul:4;
-   unsigned offset:4;
-};
-
-#define CPLX_ALIGN_MAX_MUL 8
-
-static void
-cplx_align_assert_sane(struct cplx_align a)
-{
-   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
-   assert(a.offset < a.mul);
-}
-
-/**
- * Combines two alignments to produce a least multiple of sorts.
- *
- * The returned alignment is the smallest (in terms of multiplier) such that
- * anything aligned to both a and b will be aligned to the new alignment.
- * This function will assert-fail if a and b are not compatible, i.e. if the
- * offset parameters are such that no common alignment is possible.
- */
-static struct cplx_align
-cplx_align_combine(struct cplx_align a, struct cplx_align b)
-{
-   cplx_align_assert_sane(a);
-   cplx_align_assert_sane(b);
-
-   /* Assert that the alignments agree. */
-   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
-
-   return a.mul > b.mul ? a : b;
-}
-
-/**
- * Apply a complex alignment
- *
- * This function will return the smallest number greater than or equal to
- * offset that is aligned to align.
- */
-static unsigned
-cplx_align_apply(struct cplx_align align, unsigned offset)
-{
-   return ALIGN(offset - align.offset, align.mul) + align.offset;
-}
-
-#define UNIFORM_SLOT_SIZE 4
-
-struct uniform_slot_info {
-   /** True if the given uniform slot is live */
-   unsigned is_live:1;
-
-   /** True if this slot and the next slot must remain contiguous */
-   unsigned contiguous:1;
-
-   struct cplx_align align;
-};
-
-static void
-mark_uniform_slots_read(struct uniform_slot_info *slots,
-                        unsigned num_slots, unsigned alignment)
-{
-   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
-   assert(alignment <= CPLX_ALIGN_MAX_MUL);
-
-   /* We can't align a slot to anything less than the slot size */
-   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
-
-   struct cplx_align align = {alignment, 0};
-   cplx_align_assert_sane(align);
-
-   for (unsigned i = 0; i < num_slots; i++) {
-      slots[i].is_live = true;
-      if (i < num_slots - 1)
-         slots[i].contiguous = true;
-
-      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
-      if (slots[i].align.mul == 0) {
-         slots[i].align = align;
-      } else {
-         slots[i].align = cplx_align_combine(slots[i].align, align);
-      }
-   }
-}
-
 /**
  * Assign UNIFORM file registers to either push constants or pull constants.
  *
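
Editor's note: the removed cplx_align helper is self-contained enough to exercise standalone. A minimal sketch, assuming Mesa's usual round-up ALIGN macro from u_math.h; the {4, 2} alignment is chosen so "offset above a multiple of mul" is unambiguous:

#include <assert.h>
#include <stdio.h>

/* The usual round-up-to-a-multiple macro; power-of-two alignments only. */
#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

struct cplx_align {
   unsigned mul;
   unsigned offset;
};

/* Same formula as the removed helper: smallest x >= offset with
 * x % align.mul == align.offset.  When offset < align.offset the unsigned
 * subtraction wraps around, but ALIGN masks the wrap back off. */
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
   return ALIGN(offset - align.offset, align.mul) + align.offset;
}

int
main(void)
{
   const struct cplx_align a = {4, 2};   /* aligned values: 2, 6, 10, 14, ... */
   assert(cplx_align_apply(a, 0) == 2);  /* exercises the wrap-around case */
   assert(cplx_align_apply(a, 2) == 2);
   assert(cplx_align_apply(a, 3) == 6);
   assert(cplx_align_apply(a, 6) == 6);
   assert(cplx_align_apply(a, 7) == 10);
   printf("all checks passed\n");
   return 0;
}
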
@@ -2512,197 +2407,12 @@ void
 fs_visitor::assign_constant_locations()
 {
    /* Only the first compile gets to decide on locations. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
-   if (compiler->compact_params) {
-      struct uniform_slot_info slots[uniforms + 1];
-      memset(slots, 0, sizeof(slots));
-
-      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-         for (int i = 0 ; i < inst->sources; i++) {
-            if (inst->src[i].file != UNIFORM)
-               continue;
-
-            /* NIR tightly packs things so the uniform number might not be
-             * aligned (if we have a double right after a float, for
-             * instance).  This is fine because the process of re-arranging
-             * them will ensure that things are properly aligned.  The offset
-             * into that uniform, however, must be aligned.
-             *
-             * In Vulkan, we have explicit offsets but everything is crammed
-             * into a single "variable" so inst->src[i].nr will always be 0.
-             * Everything will be properly aligned relative to that one base.
-             */
-            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
-
-            unsigned u = inst->src[i].nr +
-                         inst->src[i].offset / UNIFORM_SLOT_SIZE;
-
-            if (u >= uniforms)
-               continue;
-
-            unsigned slots_read;
-            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
-               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
-            } else {
-               unsigned bytes_read = inst->components_read(i) *
-                                     type_sz(inst->src[i].type);
-               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
-            }
-
-            assert(u + slots_read <= uniforms);
-            mark_uniform_slots_read(&slots[u], slots_read,
-                                    type_sz(inst->src[i].type));
-         }
-      }
-
-      int subgroup_id_index = get_subgroup_id_param_index(devinfo,
-                                                          stage_prog_data);
-
-      /* Only allow 16 registers (128 uniform components) as push constants.
-       *
-       * Just demote the end of the list.  We could probably do better
-       * here, demoting things that are rarely used in the program first.
-       *
-       * If changing this value, note the limitation about total_regs in
-       * brw_curbe.c.
-       */
-      unsigned int max_push_components = 16 * 8;
-      if (subgroup_id_index >= 0)
-         max_push_components--; /* Save a slot for the thread ID */
-
-      /* We push small arrays, but no bigger than 16 floats.  This is big
-       * enough for a vec4 but hopefully not large enough to push out other
-       * stuff.  We should probably use a better heuristic at some point.
-       */
-      const unsigned int max_chunk_size = 16;
-
-      unsigned int num_push_constants = 0;
-      unsigned int num_pull_constants = 0;
-
-      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
-      /* Default to -1 meaning no location */
-      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
-      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
-
-      int chunk_start = -1;
-      struct cplx_align align;
-      for (unsigned u = 0; u < uniforms; u++) {
-         if (!slots[u].is_live) {
-            assert(chunk_start == -1);
-            continue;
-         }
-
-         /* Skip subgroup_id_index to put it in the last push register. */
-         if (subgroup_id_index == (int)u)
-            continue;
-
-         if (chunk_start == -1) {
-            chunk_start = u;
-            align = slots[u].align;
-         } else {
-            /* Offset into the chunk */
-            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
-
-            /* Shift the slot alignment down by the chunk offset so it is
-             * comparable with the base chunk alignment.
-             */
-            struct cplx_align slot_align = slots[u].align;
-            slot_align.offset =
-               (slot_align.offset - chunk_offset) & (align.mul - 1);
-
-            align = cplx_align_combine(align, slot_align);
-         }
-
-         /* Sanity check the alignment */
-         cplx_align_assert_sane(align);
-
-         if (slots[u].contiguous)
-            continue;
-
-         /* Adjust the alignment to be in terms of slots, not bytes */
-         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
-         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
-         align.mul /= UNIFORM_SLOT_SIZE;
-         align.offset /= UNIFORM_SLOT_SIZE;
-
-         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
-         unsigned chunk_size = u - chunk_start + 1;
-         if ((!compiler->supports_pull_constants && u < UBO_START) ||
-             (chunk_size < max_chunk_size &&
-              push_start_align + chunk_size <= max_push_components)) {
-            /* Align up the number of push constants */
-            num_push_constants = push_start_align;
-            for (unsigned i = 0; i < chunk_size; i++)
-               push_constant_loc[chunk_start + i] = num_push_constants++;
-         } else {
-            /* We need to pull this one */
-            num_pull_constants = cplx_align_apply(align, num_pull_constants);
-            for (unsigned i = 0; i < chunk_size; i++)
-               pull_constant_loc[chunk_start + i] = num_pull_constants++;
-         }
-
-         /* Reset the chunk and start again */
-         chunk_start = -1;
-      }
-
-      /* Add the CS local thread ID uniform at the end of the push constants */
-      if (subgroup_id_index >= 0)
-         push_constant_loc[subgroup_id_index] = num_push_constants++;
-
-      /* As the uniforms are going to be reordered, stash the old array and
-       * create two new arrays for push/pull params.
-       */
-      uint32_t *param = stage_prog_data->param;
-      stage_prog_data->nr_params = num_push_constants;
-      if (num_push_constants) {
-         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
-                                                num_push_constants);
-      } else {
-         stage_prog_data->param = NULL;
-      }
-      assert(stage_prog_data->nr_pull_params == 0);
-      assert(stage_prog_data->pull_param == NULL);
-      if (num_pull_constants > 0) {
-         stage_prog_data->nr_pull_params = num_pull_constants;
-         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
-                                                     num_pull_constants);
-      }
-
-      /* Up until now, the param[] array has been indexed by reg + offset
-       * of UNIFORM registers.  Move pull constants into pull_param[] and
-       * condense param[] to only contain the uniforms we chose to push.
-       *
-       * NOTE: Because we are condensing the params[] array, we know that
-       * push_constant_loc[i] <= i and we can do it in one smooth loop without
-       * having to make a copy.
-       */
-      for (unsigned int i = 0; i < uniforms; i++) {
-         uint32_t value = param[i];
-         if (pull_constant_loc[i] != -1) {
-            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
-         } else if (push_constant_loc[i] != -1) {
-            stage_prog_data->param[push_constant_loc[i]] = value;
-         }
-      }
-      ralloc_free(param);
-   } else {
-      /* If we don't want to compact anything, just set up dummy push/pull
-       * arrays.  All the rest of the compiler cares about are these arrays.
-       */
-      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
-      for (unsigned u = 0; u < uniforms; u++)
-         push_constant_loc[u] = u;
-
-      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
-   }
+   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   for (unsigned u = 0; u < uniforms; u++)
+      push_constant_loc[u] = u;
 
    /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
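
Editor's note: the heart of the removed compaction path was the per-chunk push-vs-pull decision. A standalone sketch of that rule follows; the 16-GRF budget and 16-float chunk cap are copied from the removed code, while the helper name and the is_ubo_slot flag are hypothetical stand-ins for the real `u < UBO_START` test:

#include <stdbool.h>
#include <stdio.h>

static bool
chunk_goes_to_push(unsigned push_start_align, unsigned chunk_size,
                   bool supports_pull_constants, bool is_ubo_slot)
{
   const unsigned max_push_components = 16 * 8;  /* 16 GRFs of 8 floats */
   const unsigned max_chunk_size = 16;           /* at most a 16-float array */

   /* Same condition as the removed assign_constant_locations() loop. */
   return (!supports_pull_constants && !is_ubo_slot) ||
          (chunk_size < max_chunk_size &&
           push_start_align + chunk_size <= max_push_components);
}

int
main(void)
{
   /* A 4-float chunk starting at slot 120 still fits (120 + 4 <= 128)... */
   printf("%d\n", chunk_goes_to_push(120, 4, true, false));   /* prints 1 */
   /* ...but at slot 126 it would overflow the budget and was pulled. */
   printf("%d\n", chunk_goes_to_push(126, 4, true, false));   /* prints 0 */
   return 0;
}
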
@@ -2733,33 +2443,22 @@ fs_visitor::get_pull_locs(const fs_reg &src,
 {
    assert(src.file == UNIFORM);
 
-   if (src.nr >= UBO_START) {
-      const struct brw_ubo_range *range =
-         &prog_data->ubo_ranges[src.nr - UBO_START];
+   if (src.nr < UBO_START)
+      return false;
 
-      /* If this access is in our (reduced) range, use the push data. */
-      if (src.offset / 32 < range->length)
-         return false;
+   const struct brw_ubo_range *range =
+      &prog_data->ubo_ranges[src.nr - UBO_START];
 
-      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
-      *out_pull_index = (32 * range->start + src.offset) / 4;
+   /* If this access is in our (reduced) range, use the push data. */
+   if (src.offset / 32 < range->length)
+      return false;
 
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
+   *out_surf_index = prog_data->binding_table.ubo_start + range->block;
+   *out_pull_index = (32 * range->start + src.offset) / 4;
 
-   const unsigned location = src.nr + src.offset / 4;
+   prog_data->has_ubo_pull = true;
 
-   if (location < uniforms && pull_constant_loc[location] != -1) {
-      /* A regular uniform push constant */
-      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
-      *out_pull_index = pull_constant_loc[location];
-
-      prog_data->has_ubo_pull = true;
-      return true;
-   }
-
-   return false;
+   return true;
 }
 
 /**
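
Editor's note: the surviving index math in get_pull_locs() mixes three units, which is easy to misread. A tiny sketch with hypothetical values: range->start counts 32-byte registers, src.offset is in bytes, and the resulting pull index is in 4-byte dwords:

#include <stdio.h>

int
main(void)
{
   const unsigned range_start = 2;   /* third 32-byte register of the UBO range */
   const unsigned src_offset  = 20;  /* byte offset of the access */
   /* Same expression as get_pull_locs(): (32 * range->start + src.offset) / 4 */
   unsigned pull_index = (32 * range_start + src_offset) / 4;
   printf("%u\n", pull_index);       /* (64 + 20) / 4 = 21 */
   return 0;
}
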
@@ -369,12 +369,6 @@ public:
    /** Byte-offset for the next available spot in the scratch space buffer. */
    unsigned last_scratch;
 
-   /**
-    * Array mapping UNIFORM register numbers to the pull parameter index,
-    * or -1 if this uniform register isn't being uploaded as a pull constant.
-    */
-   int *pull_constant_loc;
-
    /**
    * Array mapping UNIFORM register numbers to the push parameter index,
    * or -1 if this uniform register isn't being uploaded as a push constant.
@@ -103,10 +103,8 @@ void
 fs_visitor::nir_setup_uniforms()
 {
    /* Only the first compile gets to set up uniforms. */
-   if (push_constant_loc) {
-      assert(pull_constant_loc);
+   if (push_constant_loc)
       return;
-   }
 
    uniforms = nir->num_uniforms / 4;
 
@@ -126,7 +126,6 @@ fs_visitor::emit_dummy_fs()
 
    /* We don't have any uniforms. */
    stage_prog_data->nr_params = 0;
-   stage_prog_data->nr_pull_params = 0;
    stage_prog_data->curb_read_length = 0;
    stage_prog_data->dispatch_grf_start_reg = 2;
    wm_prog_data->dispatch_grf_start_reg_16 = 2;
@@ -1192,7 +1191,6 @@ fs_visitor::init()
 
    this->uniforms = 0;
    this->last_scratch = 0;
-   this->pull_constant_loc = NULL;
    this->push_constant_loc = NULL;
 
    this->shader_stats.scheduler_mode = NULL;
@@ -604,194 +604,6 @@ vec4_visitor::split_uniform_registers()
    }
 }
 
-/* This function returns the register number where we placed the uniform */
-static int
-set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
-                      const int src, const int size, const int channel_size,
-                      int *new_loc, int *new_chan,
-                      int *new_chans_used)
-{
-   int dst;
-   /* Find the lowest place we can slot this uniform in. */
-   for (dst = 0; dst < nr_uniforms; dst++) {
-      if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
-         break;
-   }
-
-   assert(dst < nr_uniforms);
-
-   new_loc[src] = dst;
-   new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
-   new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;
-
-   *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
-   return dst;
-}
-
-void
-vec4_visitor::pack_uniform_registers()
-{
-   if (!compiler->compact_params)
-      return;
-
-   uint8_t chans_used[this->uniforms];
-   int new_loc[this->uniforms];
-   int new_chan[this->uniforms];
-   bool is_aligned_to_dvec4[this->uniforms];
-   int new_chans_used[this->uniforms];
-   int channel_sizes[this->uniforms];
-
-   memset(chans_used, 0, sizeof(chans_used));
-   memset(new_loc, 0, sizeof(new_loc));
-   memset(new_chan, 0, sizeof(new_chan));
-   memset(new_chans_used, 0, sizeof(new_chans_used));
-   memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
-   memset(channel_sizes, 0, sizeof(channel_sizes));
-
-   /* Find which uniform vectors are actually used by the program.  We
-    * expect unused vector elements when we've moved array access out
-    * to pull constants, and from some GLSL code generators like wine.
-    */
-   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      unsigned readmask;
-      switch (inst->opcode) {
-      case VEC4_OPCODE_PACK_BYTES:
-      case BRW_OPCODE_DP4:
-      case BRW_OPCODE_DPH:
-         readmask = 0xf;
-         break;
-      case BRW_OPCODE_DP3:
-         readmask = 0x7;
-         break;
-      case BRW_OPCODE_DP2:
-         readmask = 0x3;
-         break;
-      default:
-         readmask = inst->dst.writemask;
-         break;
-      }
-
-      for (int i = 0 ; i < 3; i++) {
-         if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
-            continue;
-
-         assert(type_sz(inst->src[i].type) % 4 == 0);
-         int channel_size = type_sz(inst->src[i].type) / 4;
-
-         int reg = inst->src[i].nr;
-         for (int c = 0; c < 4; c++) {
-            if (!(readmask & (1 << c)))
-               continue;
-
-            unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
-            unsigned used = MAX2(chans_used[reg], channel * channel_size);
-            if (used <= 4) {
-               chans_used[reg] = used;
-               channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size);
-            } else {
-               is_aligned_to_dvec4[reg] = true;
-               is_aligned_to_dvec4[reg + 1] = true;
-               chans_used[reg + 1] = used - 4;
-               channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size);
-            }
-         }
-      }
-
-      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
-          inst->src[0].file == UNIFORM) {
-         assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
-         assert(inst->src[0].subnr == 0);
-
-         unsigned bytes_read = inst->src[2].ud;
-         assert(bytes_read % 4 == 0);
-         unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
-
-         /* We just mark every register touched by a MOV_INDIRECT as being
-          * fully used.  This ensures that it doesn't broken up piecewise by
-          * the next part of our packing algorithm.
-          */
-         int reg = inst->src[0].nr;
-         int channel_size = type_sz(inst->src[0].type) / 4;
-         for (unsigned i = 0; i < vec4s_read; i++) {
-            chans_used[reg + i] = 4;
-            channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size);
-         }
-      }
-   }
-
-   int new_uniform_count = 0;
-
-   /* As the uniforms are going to be reordered, take the data from a temporary
-    * copy of the original param[].
-    */
-   uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params);
-   memcpy(param, stage_prog_data->param,
-          sizeof(uint32_t) * stage_prog_data->nr_params);
-
-   /* Now, figure out a packing of the live uniform vectors into our
-    * push constants. Start with dvec{3,4} because they are aligned to
-    * dvec4 size (2 vec4).
-    */
-   for (int src = 0; src < uniforms; src++) {
-      int size = chans_used[src];
-
-      if (size == 0 || !is_aligned_to_dvec4[src])
-         continue;
-
-      /* dvec3 are aligned to dvec4 size, apply the alignment of the size
-       * to 4 to avoid moving last component of a dvec3 to the available
-       * location at the end of a previous dvec3. These available locations
-       * could be filled by smaller variables in next loop.
-       */
-      size = ALIGN(size, 4);
-      int dst = set_push_constant_loc(uniforms, &new_uniform_count,
-                                      src, size, channel_sizes[src],
-                                      new_loc, new_chan,
-                                      new_chans_used);
-      /* Move the references to the data */
-      for (int j = 0; j < size; j++) {
-         stage_prog_data->param[dst * 4 + new_chan[src] + j] =
-            param[src * 4 + j];
-      }
-   }
-
-   /* Continue with the rest of data, which is aligned to vec4. */
-   for (int src = 0; src < uniforms; src++) {
-      int size = chans_used[src];
-
-      if (size == 0 || is_aligned_to_dvec4[src])
-         continue;
-
-      int dst = set_push_constant_loc(uniforms, &new_uniform_count,
-                                      src, size, channel_sizes[src],
-                                      new_loc, new_chan,
-                                      new_chans_used);
-      /* Move the references to the data */
-      for (int j = 0; j < size; j++) {
-         stage_prog_data->param[dst * 4 + new_chan[src] + j] =
-            param[src * 4 + j];
-      }
-   }
-
-   ralloc_free(param);
-   this->uniforms = new_uniform_count;
-   stage_prog_data->nr_params = new_uniform_count * 4;
-
-   /* Now, update the instructions for our repacked uniforms. */
-   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      for (int i = 0 ; i < 3; i++) {
-         int src = inst->src[i].nr;
-
-         if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
-            continue;
-
-         int chan = new_chan[src] / channel_sizes[src];
-         inst->src[i].nr = new_loc[src];
-         inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan);
-      }
-   }
-}
-
 /**
  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
  *
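
Editor's note: the removed vec4 packing hinged on a first-fit slotting helper. A runnable sketch of that helper with a toy workload; ALIGN/MAX2 stand in for the usual Mesa macros, and the driver data structures are replaced by plain arrays:

#include <assert.h>
#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))
#define MAX2(a, b)  ((a) > (b) ? (a) : (b))

/* Same first-fit rule as the removed set_push_constant_loc(): find the lowest
 * vec4 slot with room for `size` channels at `channel_size` alignment. */
static int
set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
                      const int src, const int size, const int channel_size,
                      int *new_loc, int *new_chan, int *new_chans_used)
{
   int dst;
   for (dst = 0; dst < nr_uniforms; dst++) {
      if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
         break;
   }
   assert(dst < nr_uniforms);

   new_loc[src] = dst;
   new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
   new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;

   *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
   return dst;
}

int
main(void)
{
   int new_loc[4] = {0}, new_chan[4] = {0}, new_chans_used[4] = {0};
   int count = 0;

   /* Two float vec2s (two 1-dword channels each) share one vec4 slot... */
   set_push_constant_loc(4, &count, 0, 2, 1, new_loc, new_chan, new_chans_used);
   set_push_constant_loc(4, &count, 1, 2, 1, new_loc, new_chan, new_chans_used);
   /* ...while a vec3 no longer fits in slot 0 and opens slot 1. */
   set_push_constant_loc(4, &count, 2, 3, 1, new_loc, new_chan, new_chans_used);

   printf("slots used: %d\n", count);                                  /* 2 */
   printf("second vec2: slot %d, chan %d\n", new_loc[1], new_chan[1]); /* 0, 2 */
   return 0;
}
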
@@ -910,97 +722,6 @@ vec4_visitor::opt_algebraic()
    return progress;
 }
 
-/**
- * Only a limited number of hardware registers may be used for push
- * constants, so this turns access to the overflowed constants into
- * pull constants.
- */
-void
-vec4_visitor::move_push_constants_to_pull_constants()
-{
-   int pull_constant_loc[this->uniforms];
-
-   const int max_uniform_components = push_length * 8;
-
-   if (this->uniforms * 4 <= max_uniform_components)
-      return;
-
-   assert(compiler->supports_pull_constants);
-   assert(compiler->compact_params);
-
-   /* If we got here, we also can't have any push ranges */
-   for (unsigned i = 0; i < 4; i++)
-      assert(prog_data->base.ubo_ranges[i].length == 0);
-
-   /* Make some sort of choice as to which uniforms get sent to pull
-    * constants.  We could potentially do something clever here like
-    * look for the most infrequently used uniform vec4s, but leave
-    * that for later.
-    */
-   for (int i = 0; i < this->uniforms * 4; i += 4) {
-      pull_constant_loc[i / 4] = -1;
-
-      if (i >= max_uniform_components) {
-         uint32_t *values = &stage_prog_data->param[i];
-
-         /* Try to find an existing copy of this uniform in the pull
-          * constants if it was part of an array access already.
-          */
-         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
-            int matches;
-
-            for (matches = 0; matches < 4; matches++) {
-               if (stage_prog_data->pull_param[j + matches] != values[matches])
-                  break;
-            }
-
-            if (matches == 4) {
-               pull_constant_loc[i / 4] = j / 4;
-               break;
-            }
-         }
-
-         if (pull_constant_loc[i / 4] == -1) {
-            assert(stage_prog_data->nr_pull_params % 4 == 0);
-            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
-
-            for (int j = 0; j < 4; j++) {
-               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
-                  values[j];
-            }
-         }
-      }
-   }
-
-   /* Now actually rewrite usage of the things we've moved to pull
-    * constants.
-    */
-   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      for (int i = 0 ; i < 3; i++) {
-         if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START ||
-             pull_constant_loc[inst->src[i].nr] == -1)
-            continue;
-
-         int uniform = inst->src[i].nr;
-
-         const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
-            glsl_type::dvec4_type : glsl_type::vec4_type;
-         dst_reg temp = dst_reg(this, temp_type);
-
-         emit_pull_constant_load(block, inst, temp, inst->src[i],
-                                 pull_constant_loc[uniform], src_reg());
-
-         inst->src[i].file = temp.file;
-         inst->src[i].nr = temp.nr;
-         inst->src[i].offset %= 16;
-         inst->src[i].reladdr = NULL;
-      }
-   }
-
-   /* Repack push constants to remove the now-unused ones. */
-   pack_uniform_registers();
-}
-
 /* Conditions for which we want to avoid setting the dependency control bits */
 bool
 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
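
Editor's note: the removed vec4 path only demoted uniforms once they overflowed a budget of push_length GRFs at 8 floats each. A toy sketch of that cutoff with a hypothetical budget:

#include <stdio.h>

int
main(void)
{
   const unsigned push_length = 8;                    /* hypothetical GRF budget */
   const unsigned max_uniform_components = push_length * 8;   /* 64 floats */
   const unsigned uniforms = 20;                      /* vec4 uniforms = 80 comps */

   /* Same overflow test as the removed loop: components at or past the
    * budget were moved to the pull constant buffer. */
   for (unsigned i = 0; i < uniforms * 4; i += 4) {
      if (i >= max_uniform_components)
         printf("uniform vec4 %u -> pull\n", i / 4);  /* vec4s 16..19 */
   }
   return 0;
}
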
@@ -1842,15 +1563,13 @@ vec4_visitor::setup_uniforms(int reg)
    /* It's possible that uniform compaction will shrink further than expected
    * so we re-compute the layout and set up our UBO push starts.
    */
-   const unsigned old_push_length = push_length;
+   ASSERTED const unsigned old_push_length = push_length;
    push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
    for (unsigned i = 0; i < 4; i++) {
       ubo_push_start[i] = push_length;
       push_length += stage_prog_data->ubo_ranges[i].length;
    }
-   assert(push_length <= old_push_length);
-   if (push_length < old_push_length)
-      assert(compiler->compact_params);
+   assert(push_length == old_push_length);
 
    /* The pre-gfx6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
@@ -2738,10 +2457,8 @@ vec4_visitor::run()
    * often do repeated subexpressions for those.
    */
    move_grf_array_access_to_scratch();
-   move_uniform_array_access_to_pull_constants();
+   split_uniform_registers();
 
-   pack_uniform_registers();
-   move_push_constants_to_pull_constants();
    split_virtual_grfs();
 
 #define OPT(pass, args...) ({ \
@@ -138,9 +138,7 @@ public:
    void spill_reg(unsigned spill_reg);
    void move_grf_array_access_to_scratch();
    void move_uniform_array_access_to_pull_constants();
-   void move_push_constants_to_pull_constants();
    void split_uniform_registers();
-   void pack_uniform_registers();
    void setup_push_ranges();
    virtual void invalidate_analysis(brw::analysis_dependency_class c);
    void split_virtual_grfs();
@@ -292,11 +290,6 @@ public:
                            int base_offset);
    void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                            int base_offset);
-   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
-                                dst_reg dst,
-                                src_reg orig_src,
-                                int base_offset,
-                                src_reg indirect);
    void emit_pull_constant_load_reg(dst_reg dst,
                                     src_reg surf_index,
                                     src_reg offset,
@@ -889,7 +889,6 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
       memcpy(prog_data->base.base.param, param,
             sizeof(uint32_t) * param_count);
       prog_data->base.base.nr_params = param_count;
-      prog_data->base.base.nr_pull_params = 0;
       ralloc_free(param);
    }
 }
@@ -1592,146 +1592,6 @@ vec4_visitor::move_grf_array_access_to_scratch()
    }
 }
 
-/**
- * Emits an instruction before @inst to load the value named by @orig_src
- * from the pull constant buffer (surface) at @base_offset to @temp.
- */
-void
-vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
-                                      dst_reg temp, src_reg orig_src,
-                                      int base_offset, src_reg indirect)
-{
-   assert(orig_src.offset % 16 == 0);
-   const unsigned index = prog_data->base.binding_table.pull_constants_start;
-
-   /* For 64bit loads we need to emit two 32-bit load messages and we also
-    * we need to shuffle the 32-bit data result into proper 64-bit data. To do
-    * that we emit the 32-bit loads into a temporary and we shuffle the result
-    * into the original destination.
-    */
-   dst_reg orig_temp = temp;
-   bool is_64bit = type_sz(orig_src.type) == 8;
-   if (is_64bit) {
-      assert(type_sz(temp.type) == 8);
-      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
-      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
-   }
-
-   src_reg src = orig_src;
-   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
-      int reg_offset = base_offset + src.offset / 16;
-
-      src_reg offset;
-      if (indirect.file != BAD_FILE) {
-         offset = src_reg(this, glsl_type::uint_type);
-         emit_before(block, inst, ADD(dst_reg(offset), indirect,
-                                      brw_imm_ud(reg_offset * 16)));
-      } else {
-         offset = brw_imm_d(reg_offset * 16);
-      }
-
-      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
-                                  brw_imm_ud(index),
-                                  offset,
-                                  block, inst);
-
-      src = byte_offset(src, 16);
-   }
-
-   if (is_64bit) {
-      temp = retype(temp, BRW_REGISTER_TYPE_DF);
-      shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
-   }
-}
-
-/**
- * Implements array access of uniforms by inserting a
- * PULL_CONSTANT_LOAD instruction.
- *
- * Unlike temporary GRF array access (where we don't support it due to
- * the difficulty of doing relative addressing on instruction
- * destinations), we could potentially do array access of uniforms
- * that were loaded in GRF space as push constants.  In real-world
- * usage we've seen, though, the arrays being used are always larger
- * than we could load as push constants, so just always move all
- * uniform array access out to a pull constant buffer.
- */
-void
-vec4_visitor::move_uniform_array_access_to_pull_constants()
-{
-   /* The vulkan dirver doesn't support pull constants other than UBOs so
-    * everything has to be pushed regardless.
-    */
-   if (!compiler->supports_pull_constants) {
-      split_uniform_registers();
-      return;
-   }
-
-   /* Allocate the pull_params array */
-   assert(stage_prog_data->nr_pull_params == 0);
-   stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
-                                              this->uniforms * 4);
-
-   int pull_constant_loc[this->uniforms];
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
-
-   /* First, walk through the instructions and determine which things need to
-    * be pulled.  We mark something as needing to be pulled by setting
-    * pull_constant_loc to 0.
-    */
-   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      /* We only care about MOV_INDIRECT of a uniform */
-      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
-          inst->src[0].file != UNIFORM)
-         continue;
-
-      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
-
-      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
-         pull_constant_loc[uniform_nr + j] = 0;
-   }
-
-   /* Next, we walk the list of uniforms and assign real pull constant
-    * locations and set their corresponding entries in pull_param.
-    */
-   for (int j = 0; j < this->uniforms; j++) {
-      if (pull_constant_loc[j] < 0)
-         continue;
-
-      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
-
-      for (int i = 0; i < 4; i++) {
-         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
-            = stage_prog_data->param[j * 4 + i];
-      }
-   }
-
-   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
-    * instructions to actual uniform pulls.
-    */
-   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      /* We only care about MOV_INDIRECT of a uniform */
-      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
-          inst->src[0].file != UNIFORM)
-         continue;
-
-      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
-
-      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
-
-      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
-                              pull_constant_loc[uniform_nr], inst->src[1]);
-      inst->remove(block);
-   }
-
-   /* Now there are no accesses of the UNIFORM file with a reladdr, so
-    * no need to track them as larger-than-vec4 objects.  This will be
-    * relied on in cutting out unused uniform vectors from push
-    * constants.
-    */
-   split_uniform_registers();
-}
-
 void
 vec4_visitor::resolve_ud_negate(src_reg *reg)
 {
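
Editor's note: the removed emit_pull_constant_load() issued one 16-byte (vec4-sized) load per loop iteration, stepping with byte_offset(src, 16), and looped twice for 64-bit types before shuffling the 32-bit results back into 64-bit order. The byte math, as a sketch:

#include <stdio.h>

int
main(void)
{
   const unsigned dvec4_bytes = 4 * 8;   /* four 64-bit channels */
   const unsigned load_bytes  = 16;      /* one vec4-sized pull load per loop */
   /* Matches the (is_64bit ? 2 : 1) loop bound in the removed code. */
   printf("loads for a dvec4: %u\n", dvec4_bytes / load_bytes);  /* 2 */
   printf("loads for a vec4:  %u\n", (4u * 4u) / load_bytes);    /* 1 */
   return 0;
}
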
@@ -974,11 +974,9 @@ anv_physical_device_try_create(struct anv_instance *instance,
    }
    device->compiler->shader_debug_log = compiler_debug_log;
    device->compiler->shader_perf_log = compiler_perf_log;
-   device->compiler->supports_pull_constants = false;
    device->compiler->constant_buffer_0_is_relative =
       device->info.ver < 8 || !device->has_context_isolation;
    device->compiler->supports_shader_constants = true;
-   device->compiler->compact_params = false;
    device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
 
    isl_device_init(&device->isl_dev, &device->info);