@@ -2307,159 +2307,190 @@ fs_visitor::assign_constant_locations()
    if (push_constant_loc != NULL)
       return;
 
-   struct uniform_slot_info slots[uniforms];
-   memset(slots, 0, sizeof(slots));
-
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      for (int i = 0 ; i < inst->sources; i++) {
-         if (inst->src[i].file != UNIFORM)
-            continue;
-
-         /* NIR tightly packs things so the uniform number might not be
-          * aligned (if we have a double right after a float, for
-          * instance).  This is fine because the process of re-arranging
-          * them will ensure that things are properly aligned.  The offset
-          * into that uniform, however, must be aligned.
-          *
-          * In Vulkan, we have explicit offsets but everything is crammed
-          * into a single "variable" so inst->src[i].nr will always be 0.
-          * Everything will be properly aligned relative to that one base.
-          */
-         assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
-
-         unsigned u = inst->src[i].nr +
-                      inst->src[i].offset / UNIFORM_SLOT_SIZE;
-
-         if (u >= uniforms)
-            continue;
-
-         unsigned slots_read;
-         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
-            slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
-         } else {
-            unsigned bytes_read = inst->components_read(i) *
-                                  type_sz(inst->src[i].type);
-            slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
-         }
-
-         assert(u + slots_read <= uniforms);
-         mark_uniform_slots_read(&slots[u], slots_read,
-                                 type_sz(inst->src[i].type));
-      }
-   }
-
-   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
-
-   /* Only allow 16 registers (128 uniform components) as push constants.
-    *
-    * Just demote the end of the list.  We could probably do better
-    * here, demoting things that are rarely used in the program first.
-    *
-    * If changing this value, note the limitation about total_regs in
-    * brw_curbe.c.
-    */
-   unsigned int max_push_components = 16 * 8;
-   if (subgroup_id_index >= 0)
-      max_push_components--; /* Save a slot for the thread ID */
-
-   /* We push small arrays, but no bigger than 16 floats.  This is big
-    * enough for a vec4 but hopefully not large enough to push out other
-    * stuff.  We should probably use a better heuristic at some point.
-    */
-   const unsigned int max_chunk_size = 16;
-
-   unsigned int num_push_constants = 0;
-   unsigned int num_pull_constants = 0;
-
-   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
-   /* Default to -1 meaning no location */
-   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
-   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
-
-   int chunk_start = -1;
-   struct cplx_align align;
-   for (unsigned u = 0; u < uniforms; u++) {
-      if (!slots[u].is_live) {
-         assert(chunk_start == -1);
-         continue;
-      }
-
-      /* Skip subgroup_id_index to put it in the last push register. */
-      if (subgroup_id_index == (int)u)
-         continue;
-
-      if (chunk_start == -1) {
-         chunk_start = u;
-         align = slots[u].align;
-      } else {
-         /* Offset into the chunk */
-         unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
-
-         /* Shift the slot alignment down by the chunk offset so it is
-          * comparable with the base chunk alignment.
-          */
-         struct cplx_align slot_align = slots[u].align;
-         slot_align.offset =
-            (slot_align.offset - chunk_offset) & (align.mul - 1);
-
-         align = cplx_align_combine(align, slot_align);
-      }
-
-      /* Sanity check the alignment */
-      cplx_align_assert_sane(align);
-
-      if (slots[u].contiguous)
-         continue;
-
-      /* Adjust the alignment to be in terms of slots, not bytes */
-      assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      align.mul /= UNIFORM_SLOT_SIZE;
-      align.offset /= UNIFORM_SLOT_SIZE;
-
-      unsigned push_start_align = cplx_align_apply(align, num_push_constants);
-      unsigned chunk_size = u - chunk_start + 1;
-      if ((!compiler->supports_pull_constants && u < UBO_START) ||
-          (chunk_size < max_chunk_size &&
-           push_start_align + chunk_size <= max_push_components)) {
-         /* Align up the number of push constants */
-         num_push_constants = push_start_align;
-         for (unsigned i = 0; i < chunk_size; i++)
-            push_constant_loc[chunk_start + i] = num_push_constants++;
-      } else {
-         /* We need to pull this one */
-         num_pull_constants = cplx_align_apply(align, num_pull_constants);
-         for (unsigned i = 0; i < chunk_size; i++)
-            pull_constant_loc[chunk_start + i] = num_pull_constants++;
-      }
-
-      /* Reset the chunk and start again */
-      chunk_start = -1;
-   }
-
-   /* Add the CS local thread ID uniform at the end of the push constants */
-   if (subgroup_id_index >= 0)
-      push_constant_loc[subgroup_id_index] = num_push_constants++;
-
-   /* As the uniforms are going to be reordered, stash the old array and
-    * create two new arrays for push/pull params.
-    */
-   uint32_t *param = stage_prog_data->param;
-   stage_prog_data->nr_params = num_push_constants;
-   if (num_push_constants) {
-      stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
-                                             num_push_constants);
-   } else {
-      stage_prog_data->param = NULL;
-   }
-   assert(stage_prog_data->nr_pull_params == 0);
-   assert(stage_prog_data->pull_param == NULL);
-   if (num_pull_constants > 0) {
-      stage_prog_data->nr_pull_params = num_pull_constants;
-      stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
-                                                  num_pull_constants);
-   }
+   if (compiler->compact_params) {
+      struct uniform_slot_info slots[uniforms];
+      memset(slots, 0, sizeof(slots));
+
+      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+         for (int i = 0 ; i < inst->sources; i++) {
+            if (inst->src[i].file != UNIFORM)
+               continue;
+
+            /* NIR tightly packs things so the uniform number might not be
+             * aligned (if we have a double right after a float, for instance).
+             * This is fine because the process of re-arranging them will ensure
+             * that things are properly aligned.  The offset into that uniform,
+             * however, must be aligned.
+             *
+             * In Vulkan, we have explicit offsets but everything is crammed
+             * into a single "variable" so inst->src[i].nr will always be 0.
+             * Everything will be properly aligned relative to that one base.
+             */
+            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
+
+            unsigned u = inst->src[i].nr +
+                         inst->src[i].offset / UNIFORM_SLOT_SIZE;
+
+            if (u >= uniforms)
+               continue;
+
+            unsigned slots_read;
+            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
+            } else {
+               unsigned bytes_read = inst->components_read(i) *
+                                     type_sz(inst->src[i].type);
+               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
+            }
+
+            assert(u + slots_read <= uniforms);
+            mark_uniform_slots_read(&slots[u], slots_read,
+                                    type_sz(inst->src[i].type));
+         }
+      }
+
+      int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
+
+      /* Only allow 16 registers (128 uniform components) as push constants.
+       *
+       * Just demote the end of the list.  We could probably do better
+       * here, demoting things that are rarely used in the program first.
+       *
+       * If changing this value, note the limitation about total_regs in
+       * brw_curbe.c.
+       */
+      unsigned int max_push_components = 16 * 8;
+      if (subgroup_id_index >= 0)
+         max_push_components--; /* Save a slot for the thread ID */
+
+      /* We push small arrays, but no bigger than 16 floats.  This is big enough
+       * for a vec4 but hopefully not large enough to push out other stuff.  We
+       * should probably use a better heuristic at some point.
+       */
+      const unsigned int max_chunk_size = 16;
+
+      unsigned int num_push_constants = 0;
+      unsigned int num_pull_constants = 0;
+
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+      /* Default to -1 meaning no location */
+      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
+      int chunk_start = -1;
+      struct cplx_align align;
+      for (unsigned u = 0; u < uniforms; u++) {
+         if (!slots[u].is_live) {
+            assert(chunk_start == -1);
+            continue;
+         }
+
+         /* Skip subgroup_id_index to put it in the last push register. */
+         if (subgroup_id_index == (int)u)
+            continue;
+
+         if (chunk_start == -1) {
+            chunk_start = u;
+            align = slots[u].align;
+         } else {
+            /* Offset into the chunk */
+            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
+
+            /* Shift the slot alignment down by the chunk offset so it is
+             * comparable with the base chunk alignment.
+             */
+            struct cplx_align slot_align = slots[u].align;
+            slot_align.offset =
+               (slot_align.offset - chunk_offset) & (align.mul - 1);
+
+            align = cplx_align_combine(align, slot_align);
+         }
+
+         /* Sanity check the alignment */
+         cplx_align_assert_sane(align);
+
+         if (slots[u].contiguous)
+            continue;
+
+         /* Adjust the alignment to be in terms of slots, not bytes */
+         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         align.mul /= UNIFORM_SLOT_SIZE;
+         align.offset /= UNIFORM_SLOT_SIZE;
+
+         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
+         unsigned chunk_size = u - chunk_start + 1;
+         if ((!compiler->supports_pull_constants && u < UBO_START) ||
+             (chunk_size < max_chunk_size &&
+              push_start_align + chunk_size <= max_push_components)) {
+            /* Align up the number of push constants */
+            num_push_constants = push_start_align;
+            for (unsigned i = 0; i < chunk_size; i++)
+               push_constant_loc[chunk_start + i] = num_push_constants++;
+         } else {
+            /* We need to pull this one */
+            num_pull_constants = cplx_align_apply(align, num_pull_constants);
+            for (unsigned i = 0; i < chunk_size; i++)
+               pull_constant_loc[chunk_start + i] = num_pull_constants++;
+         }
+
+         /* Reset the chunk and start again */
+         chunk_start = -1;
+      }
+
+      /* Add the CS local thread ID uniform at the end of the push constants */
+      if (subgroup_id_index >= 0)
+         push_constant_loc[subgroup_id_index] = num_push_constants++;
+
+      /* As the uniforms are going to be reordered, stash the old array and
+       * create two new arrays for push/pull params.
+       */
+      uint32_t *param = stage_prog_data->param;
+      stage_prog_data->nr_params = num_push_constants;
+      if (num_push_constants) {
+         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
+                                                num_push_constants);
+      } else {
+         stage_prog_data->param = NULL;
+      }
+      assert(stage_prog_data->nr_pull_params == 0);
+      assert(stage_prog_data->pull_param == NULL);
+      if (num_pull_constants > 0) {
+         stage_prog_data->nr_pull_params = num_pull_constants;
+         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
+                                                     num_pull_constants);
+      }
+
+      /* Up until now, the param[] array has been indexed by reg + offset
+       * of UNIFORM registers.  Move pull constants into pull_param[] and
+       * condense param[] to only contain the uniforms we chose to push.
+       *
+       * NOTE: Because we are condensing the params[] array, we know that
+       * push_constant_loc[i] <= i and we can do it in one smooth loop without
+       * having to make a copy.
+       */
+      for (unsigned int i = 0; i < uniforms; i++) {
+         uint32_t value = param[i];
+         if (pull_constant_loc[i] != -1) {
+            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+         } else if (push_constant_loc[i] != -1) {
+            stage_prog_data->param[push_constant_loc[i]] = value;
+         }
+      }
+      ralloc_free(param);
+   } else {
+      /* If we don't want to compact anything, just set up dummy push/pull
+       * arrays.  All the rest of the compiler cares about are these arrays.
+       */
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+      for (unsigned u = 0; u < uniforms; u++)
+         push_constant_loc[u] = u;
+
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+   }
 
    /* Now that we know how many regular uniforms we'll push, reduce the
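
For intuition on the slots_read computation in the scan loop of the hunk above: a uniform slot is a single 32-bit element (UNIFORM_SLOT_SIZE is taken to be 4 bytes here, an assumption about this file's definition), so a source reading N components of an M-byte type covers DIV_ROUND_UP(N * M, 4) slots. A minimal standalone sketch, not code from the patch:

#include <assert.h>

#define UNIFORM_SLOT_SIZE 4   /* bytes; assumed to match the driver's value */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int
main(void)
{
   /* Two components of a 64-bit (8-byte) type span four 32-bit slots. */
   assert(DIV_ROUND_UP(2 * 8, UNIFORM_SLOT_SIZE) == 4);

   /* A single 32-bit float occupies exactly one slot. */
   assert(DIV_ROUND_UP(1 * 4, UNIFORM_SLOT_SIZE) == 1);
   return 0;
}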
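The control flow of this change is easier to see outside diff form: the whole compaction pass now lives under if (compiler->compact_params), the param[]-condensing loop is deleted from its old position by the second hunk below and re-added inside that branch (so ralloc_free(param) still runs after the values are copied out), and the new else branch sets up identity push locations. The chunk push-or-pull decision hinges on cplx_align_apply; the sketch below is a minimal standalone rendering of that decision, assuming cplx_align means "addresses a with a % mul == offset" and a simple round-up apply(), which may differ in detail from the real helpers:

#include <assert.h>
#include <stdio.h>

struct cplx_align { unsigned mul, offset; };

/* Round v up to the next value with v % a.mul == a.offset (assumed
 * semantics; the real cplx_align_apply lives elsewhere in this file). */
static unsigned
cplx_align_apply(struct cplx_align a, unsigned v)
{
   assert(a.offset < a.mul);
   return (v + a.mul - 1 - a.offset) / a.mul * a.mul + a.offset;
}

int
main(void)
{
   /* A two-slot chunk that must start on an even slot, e.g. a double. */
   struct cplx_align align = { 2, 0 };
   unsigned num_push_constants = 3;       /* next free push slot */
   const unsigned max_push_components = 16 * 8;
   const unsigned max_chunk_size = 16;
   const unsigned chunk_size = 2;

   unsigned start = cplx_align_apply(align, num_push_constants);
   if (chunk_size < max_chunk_size &&
       start + chunk_size <= max_push_components)
      printf("push chunk at slots %u..%u\n", start, start + chunk_size - 1);
   else
      printf("demote chunk to pull constants\n");
   return 0;
}
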
@@ -2475,24 +2506,6 @@ fs_visitor::assign_constant_locations()
       push_length += range->length;
    }
    assert(push_length <= 64);
-
-   /* Up until now, the param[] array has been indexed by reg + offset
-    * of UNIFORM registers.  Move pull constants into pull_param[] and
-    * condense param[] to only contain the uniforms we chose to push.
-    *
-    * NOTE: Because we are condensing the params[] array, we know that
-    * push_constant_loc[i] <= i and we can do it in one smooth loop without
-    * having to make a copy.
-    */
-   for (unsigned int i = 0; i < uniforms; i++) {
-      uint32_t value = param[i];
-      if (pull_constant_loc[i] != -1) {
-         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
-      } else if (push_constant_loc[i] != -1) {
-         stage_prog_data->param[push_constant_loc[i]] = value;
-      }
-   }
-   ralloc_free(param);
 }
 
 bool
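
One subtlety in the dummy-array setup of the first hunk (and in its earlier memsets of push_constant_loc and pull_constant_loc): initializing an array of int to -1 with memset is only valid because every byte of a two's-complement -1 is 0xff, so filling the elements byte by byte still yields -1. A standalone illustration:

#include <assert.h>
#include <string.h>

int
main(void)
{
   int loc[4];

   /* 0xff in every byte is -1 in every two's-complement int element. */
   memset(loc, -1, sizeof(loc));
   for (unsigned i = 0; i < 4; i++)
      assert(loc[i] == -1);
   return 0;
}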