broadcom/vc5: Add support for register spilling.
Our register spilling support is nice to have since vc4 couldn't spill at all, but we're still very restricted due to needing to not spill during a TMU operation, or during the last segment of the program (which would be nice to spill a value of, when there's a long-lived value being passed through with little modification from the start to the end). We could do better by emitting unspills for the last-segment values just before the last thrsw, since the last segment is probably not the maximum interference area. Fixes GTF uniform_buffer_object_arrays_of_all_valid_basic_types and 3 others.
This commit is contained in:
@@ -1919,12 +1919,11 @@ vir_remove_thrsw(struct v3d_compile *c)
|
|||||||
vir_remove_instruction(c, inst);
|
vir_remove_instruction(c, inst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
vir_calculate_live_intervals(c);
|
|
||||||
|
|
||||||
c->last_thrsw = NULL;
|
c->last_thrsw = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
void
|
||||||
vir_emit_last_thrsw(struct v3d_compile *c)
|
vir_emit_last_thrsw(struct v3d_compile *c)
|
||||||
{
|
{
|
||||||
/* On V3D before 4.1, we need a TMU op to be outstanding when thread
|
/* On V3D before 4.1, we need a TMU op to be outstanding when thread
|
||||||
@@ -2012,16 +2011,16 @@ v3d_nir_to_vir(struct v3d_compile *c)
|
|||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Compute the live ranges so we can figure out interference. */
|
|
||||||
vir_calculate_live_intervals(c);
|
|
||||||
|
|
||||||
/* Attempt to allocate registers for the temporaries. If we fail,
|
/* Attempt to allocate registers for the temporaries. If we fail,
|
||||||
* reduce thread count and try again.
|
* reduce thread count and try again.
|
||||||
*/
|
*/
|
||||||
int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
|
int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
|
||||||
struct qpu_reg *temp_registers;
|
struct qpu_reg *temp_registers;
|
||||||
while (true) {
|
while (true) {
|
||||||
temp_registers = v3d_register_allocate(c);
|
bool spilled;
|
||||||
|
temp_registers = v3d_register_allocate(c, &spilled);
|
||||||
|
if (spilled)
|
||||||
|
continue;
|
||||||
|
|
||||||
if (temp_registers)
|
if (temp_registers)
|
||||||
break;
|
break;
|
||||||
|
@@ -248,6 +248,12 @@ enum quniform_contents {
|
|||||||
|
|
||||||
QUNIFORM_ALPHA_REF,
|
QUNIFORM_ALPHA_REF,
|
||||||
QUNIFORM_SAMPLE_MASK,
|
QUNIFORM_SAMPLE_MASK,
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * Returns the offset of the scratch buffer for register spilling.
|
||||||
|
*/
|
||||||
|
QUNIFORM_SPILL_OFFSET,
|
||||||
|
QUNIFORM_SPILL_SIZE_PER_THREAD,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct v3d_varying_slot {
|
struct v3d_varying_slot {
|
||||||
@@ -506,6 +512,20 @@ struct v3d_compile {
|
|||||||
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
|
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
|
||||||
uint32_t num_vpm_writes;
|
uint32_t num_vpm_writes;
|
||||||
|
|
||||||
|
/* Size in bytes of registers that have been spilled. This is how much
|
||||||
|
* space needs to be available in the spill BO per thread per QPU.
|
||||||
|
*/
|
||||||
|
uint32_t spill_size;
|
||||||
|
/* Shader-db stats for register spilling. */
|
||||||
|
uint32_t spills, fills;
|
||||||
|
/**
|
||||||
|
* Register spilling's per-thread base address, shared between each
|
||||||
|
* spill/fill's addressing calculations.
|
||||||
|
*/
|
||||||
|
struct qreg spill_base;
|
||||||
|
/* Bit vector of which temps may be spilled */
|
||||||
|
BITSET_WORD *spillable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
|
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
|
||||||
*
|
*
|
||||||
@@ -600,6 +620,7 @@ struct v3d_prog_data {
|
|||||||
struct v3d_ubo_range *ubo_ranges;
|
struct v3d_ubo_range *ubo_ranges;
|
||||||
uint32_t num_ubo_ranges;
|
uint32_t num_ubo_ranges;
|
||||||
uint32_t ubo_size;
|
uint32_t ubo_size;
|
||||||
|
uint32_t spill_size;
|
||||||
|
|
||||||
uint8_t num_inputs;
|
uint8_t num_inputs;
|
||||||
uint8_t threads;
|
uint8_t threads;
|
||||||
@@ -697,6 +718,7 @@ void vir_set_unpack(struct qinst *inst, int src,
|
|||||||
enum v3d_qpu_input_unpack unpack);
|
enum v3d_qpu_input_unpack unpack);
|
||||||
|
|
||||||
struct qreg vir_get_temp(struct v3d_compile *c);
|
struct qreg vir_get_temp(struct v3d_compile *c);
|
||||||
|
void vir_emit_last_thrsw(struct v3d_compile *c);
|
||||||
void vir_calculate_live_intervals(struct v3d_compile *c);
|
void vir_calculate_live_intervals(struct v3d_compile *c);
|
||||||
bool vir_has_implicit_uniform(struct qinst *inst);
|
bool vir_has_implicit_uniform(struct qinst *inst);
|
||||||
int vir_get_implicit_uniform_src(struct qinst *inst);
|
int vir_get_implicit_uniform_src(struct qinst *inst);
|
||||||
@@ -746,7 +768,7 @@ void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
|
|||||||
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
|
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
|
||||||
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
|
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
|
||||||
void qpu_validate(struct v3d_compile *c);
|
void qpu_validate(struct v3d_compile *c);
|
||||||
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
|
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
|
||||||
bool vir_init_reg_sets(struct v3d_compiler *compiler);
|
bool vir_init_reg_sets(struct v3d_compiler *compiler);
|
||||||
|
|
||||||
void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
|
void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
|
||||||
|
@@ -354,10 +354,17 @@ vir_get_temp(struct v3d_compile *c)
|
|||||||
if (c->num_temps > c->defs_array_size) {
|
if (c->num_temps > c->defs_array_size) {
|
||||||
uint32_t old_size = c->defs_array_size;
|
uint32_t old_size = c->defs_array_size;
|
||||||
c->defs_array_size = MAX2(old_size * 2, 16);
|
c->defs_array_size = MAX2(old_size * 2, 16);
|
||||||
|
|
||||||
c->defs = reralloc(c, c->defs, struct qinst *,
|
c->defs = reralloc(c, c->defs, struct qinst *,
|
||||||
c->defs_array_size);
|
c->defs_array_size);
|
||||||
memset(&c->defs[old_size], 0,
|
memset(&c->defs[old_size], 0,
|
||||||
sizeof(c->defs[0]) * (c->defs_array_size - old_size));
|
sizeof(c->defs[0]) * (c->defs_array_size - old_size));
|
||||||
|
|
||||||
|
c->spillable = reralloc(c, c->spillable,
|
||||||
|
BITSET_WORD,
|
||||||
|
BITSET_WORDS(c->defs_array_size));
|
||||||
|
for (int i = old_size; i < c->defs_array_size; i++)
|
||||||
|
BITSET_SET(c->spillable, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
return reg;
|
return reg;
|
||||||
@@ -653,6 +660,7 @@ v3d_set_prog_data(struct v3d_compile *c,
|
|||||||
{
|
{
|
||||||
prog_data->threads = c->threads;
|
prog_data->threads = c->threads;
|
||||||
prog_data->single_seg = !c->last_thrsw;
|
prog_data->single_seg = !c->last_thrsw;
|
||||||
|
prog_data->spill_size = c->spill_size;
|
||||||
|
|
||||||
v3d_set_prog_data_uniforms(c, prog_data);
|
v3d_set_prog_data_uniforms(c, prog_data);
|
||||||
v3d_set_prog_data_ubo(c, prog_data);
|
v3d_set_prog_data_ubo(c, prog_data);
|
||||||
|
@@ -33,6 +33,211 @@
|
|||||||
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
|
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
|
||||||
#define PHYS_COUNT 64
|
#define PHYS_COUNT 64
|
||||||
|
|
||||||
|
static bool
|
||||||
|
is_last_ldtmu(struct qinst *inst, struct qblock *block)
|
||||||
|
{
|
||||||
|
list_for_each_entry_from(struct qinst, scan_inst, inst,
|
||||||
|
&block->instructions, link) {
|
||||||
|
if (inst->qpu.sig.ldtmu)
|
||||||
|
return false;
|
||||||
|
if (v3d_qpu_writes_tmu(&inst->qpu))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
|
||||||
|
uint32_t *temp_to_node)
|
||||||
|
{
|
||||||
|
float block_scale = 1.0;
|
||||||
|
float spill_costs[c->num_temps];
|
||||||
|
bool in_tmu_operation = false;
|
||||||
|
bool started_last_seg = false;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < c->num_temps; i++)
|
||||||
|
spill_costs[i] = 0.0;
|
||||||
|
|
||||||
|
/* XXX: Scale the cost up when inside of a loop. */
|
||||||
|
vir_for_each_block(block, c) {
|
||||||
|
vir_for_each_inst(inst, block) {
|
||||||
|
/* We can't insert a new TMU operation while currently
|
||||||
|
* in a TMU operation, and we can't insert new thread
|
||||||
|
* switches after starting output writes.
|
||||||
|
*/
|
||||||
|
bool no_spilling =
|
||||||
|
(in_tmu_operation ||
|
||||||
|
(c->threads > 1 && started_last_seg));
|
||||||
|
|
||||||
|
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||||
|
if (inst->src[i].file != QFILE_TEMP)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
int temp = inst->src[i].index;
|
||||||
|
if (no_spilling) {
|
||||||
|
BITSET_CLEAR(c->spillable,
|
||||||
|
temp);
|
||||||
|
} else {
|
||||||
|
spill_costs[temp] += block_scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst->dst.file == QFILE_TEMP) {
|
||||||
|
int temp = inst->dst.index;
|
||||||
|
|
||||||
|
if (no_spilling) {
|
||||||
|
BITSET_CLEAR(c->spillable,
|
||||||
|
temp);
|
||||||
|
} else {
|
||||||
|
spill_costs[temp] += block_scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst->is_last_thrsw)
|
||||||
|
started_last_seg = true;
|
||||||
|
|
||||||
|
if (v3d_qpu_writes_vpm(&inst->qpu) ||
|
||||||
|
v3d_qpu_uses_tlb(&inst->qpu))
|
||||||
|
started_last_seg = true;
|
||||||
|
|
||||||
|
/* Track when we're in between a TMU setup and the
|
||||||
|
* final LDTMU from that TMU setup. We can't
|
||||||
|
* spill/fill any temps during that time, because that
|
||||||
|
* involves inserting a new TMU setup/LDTMU sequence.
|
||||||
|
*/
|
||||||
|
if (inst->qpu.sig.ldtmu &&
|
||||||
|
is_last_ldtmu(inst, block))
|
||||||
|
in_tmu_operation = false;
|
||||||
|
|
||||||
|
if (v3d_qpu_writes_tmu(&inst->qpu))
|
||||||
|
in_tmu_operation = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < c->num_temps; i++) {
|
||||||
|
int node = temp_to_node[i];
|
||||||
|
|
||||||
|
if (BITSET_TEST(c->spillable, i))
|
||||||
|
ra_set_node_spill_cost(g, node, spill_costs[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ra_get_best_spill_node(g);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The spill offset for this thread takes a bit of setup, so do it once at
|
||||||
|
* program start.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
v3d_setup_spill_base(struct v3d_compile *c)
|
||||||
|
{
|
||||||
|
c->cursor = vir_before_block(vir_entry_block(c));
|
||||||
|
|
||||||
|
int start_num_temps = c->num_temps;
|
||||||
|
|
||||||
|
/* Each thread wants to be in a separate region of the scratch space
|
||||||
|
* so that the QPUs aren't fighting over cache lines. We have the
|
||||||
|
* driver keep a single global spill BO rather than
|
||||||
|
* per-spilling-program BOs, so we need a uniform from the driver for
|
||||||
|
* what the per-thread scale is.
|
||||||
|
*/
|
||||||
|
struct qreg thread_offset =
|
||||||
|
vir_UMUL(c,
|
||||||
|
vir_TIDX(c),
|
||||||
|
vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
|
||||||
|
|
||||||
|
/* Each channel in a reg is 4 bytes, so scale them up by that. */
|
||||||
|
struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
|
||||||
|
vir_uniform_ui(c, 2));
|
||||||
|
|
||||||
|
c->spill_base = vir_ADD(c,
|
||||||
|
vir_ADD(c, thread_offset, element_offset),
|
||||||
|
vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
|
||||||
|
|
||||||
|
/* Make sure that we don't spill the spilling setup instructions. */
|
||||||
|
for (int i = start_num_temps; i < c->num_temps; i++)
|
||||||
|
BITSET_CLEAR(c->spillable, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
|
||||||
|
{
|
||||||
|
vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
|
||||||
|
V3D_QPU_WADDR_TMUA),
|
||||||
|
c->spill_base,
|
||||||
|
vir_uniform_ui(c, spill_offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
|
||||||
|
{
|
||||||
|
uint32_t spill_offset = c->spill_size;
|
||||||
|
c->spill_size += 16 * sizeof(uint32_t);
|
||||||
|
|
||||||
|
if (spill_offset == 0)
|
||||||
|
v3d_setup_spill_base(c);
|
||||||
|
|
||||||
|
struct qinst *last_thrsw = c->last_thrsw;
|
||||||
|
assert(!last_thrsw || last_thrsw->is_last_thrsw);
|
||||||
|
|
||||||
|
int start_num_temps = c->num_temps;
|
||||||
|
|
||||||
|
vir_for_each_inst_inorder(inst, c) {
|
||||||
|
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||||
|
if (inst->src[i].file != QFILE_TEMP ||
|
||||||
|
inst->src[i].index != spill_temp) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
c->cursor = vir_before_inst(inst);
|
||||||
|
|
||||||
|
v3d_emit_spill_tmua(c, spill_offset);
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
inst->src[i] = vir_LDTMU(c);
|
||||||
|
c->fills++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst->dst.file == QFILE_TEMP &&
|
||||||
|
inst->dst.index == spill_temp) {
|
||||||
|
c->cursor = vir_after_inst(inst);
|
||||||
|
|
||||||
|
inst->dst.index = c->num_temps++;
|
||||||
|
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
|
||||||
|
inst->dst);
|
||||||
|
v3d_emit_spill_tmua(c, spill_offset);
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
c->spills++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If we didn't have a last-thrsw inserted by nir_to_vir and
|
||||||
|
* we've been inserting thrsws, then insert a new last_thrsw
|
||||||
|
* right before we start the vpm/tlb sequence for the last
|
||||||
|
* thread segment.
|
||||||
|
*/
|
||||||
|
if (!last_thrsw && c->last_thrsw &&
|
||||||
|
(v3d_qpu_writes_vpm(&inst->qpu) ||
|
||||||
|
v3d_qpu_uses_tlb(&inst->qpu))) {
|
||||||
|
c->cursor = vir_before_inst(inst);
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
|
||||||
|
last_thrsw = c->last_thrsw;
|
||||||
|
last_thrsw->is_last_thrsw = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Make sure c->last_thrsw is the actual last thrsw, not just one we
|
||||||
|
* inserted in our most recent unspill.
|
||||||
|
*/
|
||||||
|
if (last_thrsw)
|
||||||
|
c->last_thrsw = last_thrsw;
|
||||||
|
|
||||||
|
/* Don't allow spilling of our spilling instructions. There's no way
|
||||||
|
* they can help get things colored.
|
||||||
|
*/
|
||||||
|
for (int i = start_num_temps; i < c->num_temps; i++)
|
||||||
|
BITSET_CLEAR(c->spillable, i);
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
vir_init_reg_sets(struct v3d_compiler *compiler)
|
vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||||
{
|
{
|
||||||
@@ -96,7 +301,7 @@ node_to_temp_priority(const void *in_a, const void *in_b)
|
|||||||
* The return value should be freed by the caller.
|
* The return value should be freed by the caller.
|
||||||
*/
|
*/
|
||||||
struct qpu_reg *
|
struct qpu_reg *
|
||||||
v3d_register_allocate(struct v3d_compile *c)
|
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
|
||||||
{
|
{
|
||||||
struct node_to_temp_map map[c->num_temps];
|
struct node_to_temp_map map[c->num_temps];
|
||||||
uint32_t temp_to_node[c->num_temps];
|
uint32_t temp_to_node[c->num_temps];
|
||||||
@@ -105,9 +310,10 @@ v3d_register_allocate(struct v3d_compile *c)
|
|||||||
sizeof(*temp_registers));
|
sizeof(*temp_registers));
|
||||||
int acc_nodes[ACC_COUNT];
|
int acc_nodes[ACC_COUNT];
|
||||||
|
|
||||||
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
|
*spilled = false;
|
||||||
c->num_temps +
|
|
||||||
ARRAY_SIZE(acc_nodes));
|
vir_calculate_live_intervals(c);
|
||||||
|
|
||||||
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
|
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
|
||||||
*
|
*
|
||||||
* V3D 4.x has double the physical register space, so 64 physical regs
|
* V3D 4.x has double the physical register space, so 64 physical regs
|
||||||
@@ -119,6 +325,10 @@ v3d_register_allocate(struct v3d_compile *c)
|
|||||||
thread_index--;
|
thread_index--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
|
||||||
|
c->num_temps +
|
||||||
|
ARRAY_SIZE(acc_nodes));
|
||||||
|
|
||||||
/* Make some fixed nodes for the accumulators, which we will need to
|
/* Make some fixed nodes for the accumulators, which we will need to
|
||||||
* interfere with when ops have implied r3/r4 writes or for the thread
|
* interfere with when ops have implied r3/r4 writes or for the thread
|
||||||
* switches. We could represent these as classes for the nodes to
|
* switches. We could represent these as classes for the nodes to
|
||||||
@@ -254,6 +464,20 @@ v3d_register_allocate(struct v3d_compile *c)
|
|||||||
|
|
||||||
bool ok = ra_allocate(g);
|
bool ok = ra_allocate(g);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
|
/* Try to spill, if we can't reduce threading first. */
|
||||||
|
if (thread_index == 0) {
|
||||||
|
int node = v3d_choose_spill_node(c, g, temp_to_node);
|
||||||
|
|
||||||
|
if (node != -1) {
|
||||||
|
v3d_spill_reg(c, map[node].temp);
|
||||||
|
ralloc_free(g);
|
||||||
|
|
||||||
|
/* Ask the outer loop to call back in. */
|
||||||
|
*spilled = true;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
free(temp_registers);
|
free(temp_registers);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@@ -280,5 +504,17 @@ v3d_register_allocate(struct v3d_compile *c)
|
|||||||
|
|
||||||
ralloc_free(g);
|
ralloc_free(g);
|
||||||
|
|
||||||
|
if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
|
||||||
|
fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
|
||||||
|
vir_get_stage_name(c),
|
||||||
|
c->program_id, c->variant_id,
|
||||||
|
c->spills);
|
||||||
|
|
||||||
|
fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
|
||||||
|
vir_get_stage_name(c),
|
||||||
|
c->program_id, c->variant_id,
|
||||||
|
c->fills);
|
||||||
|
}
|
||||||
|
|
||||||
return temp_registers;
|
return temp_registers;
|
||||||
}
|
}
|
||||||
|
@@ -154,6 +154,9 @@ struct vc5_compiled_shader {
|
|||||||
struct vc5_program_stateobj {
|
struct vc5_program_stateobj {
|
||||||
struct vc5_uncompiled_shader *bind_vs, *bind_fs;
|
struct vc5_uncompiled_shader *bind_vs, *bind_fs;
|
||||||
struct vc5_compiled_shader *cs, *vs, *fs;
|
struct vc5_compiled_shader *cs, *vs, *fs;
|
||||||
|
|
||||||
|
struct vc5_bo *spill_bo;
|
||||||
|
int spill_size_per_thread;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct vc5_constbuf_stateobj {
|
struct vc5_constbuf_stateobj {
|
||||||
|
@@ -267,6 +267,21 @@ vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key)
|
|||||||
memcpy(dup_key, key, key_size);
|
memcpy(dup_key, key, key_size);
|
||||||
_mesa_hash_table_insert(ht, dup_key, shader);
|
_mesa_hash_table_insert(ht, dup_key, shader);
|
||||||
|
|
||||||
|
if (shader->prog_data.base->spill_size >
|
||||||
|
vc5->prog.spill_size_per_thread) {
|
||||||
|
/* Max 4 QPUs per slice, 3 slices per core. We only do single
|
||||||
|
* core so far. This overallocates memory on smaller cores.
|
||||||
|
*/
|
||||||
|
int total_spill_size =
|
||||||
|
4 * 3 * shader->prog_data.base->spill_size;
|
||||||
|
|
||||||
|
vc5_bo_unreference(&vc5->prog.spill_bo);
|
||||||
|
vc5->prog.spill_bo = vc5_bo_alloc(vc5->screen,
|
||||||
|
total_spill_size, "spill");
|
||||||
|
vc5->prog.spill_size_per_thread =
|
||||||
|
shader->prog_data.base->spill_size;
|
||||||
|
}
|
||||||
|
|
||||||
return shader;
|
return shader;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -389,6 +389,16 @@ vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader,
|
|||||||
/* XXX */
|
/* XXX */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case QUNIFORM_SPILL_OFFSET:
|
||||||
|
cl_aligned_reloc(&job->indirect, &uniforms,
|
||||||
|
vc5->prog.spill_bo, 0);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case QUNIFORM_SPILL_SIZE_PER_THREAD:
|
||||||
|
cl_aligned_u32(&uniforms,
|
||||||
|
vc5->prog.spill_size_per_thread);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
assert(quniform_contents_is_texture_p0(uinfo->contents[i]));
|
assert(quniform_contents_is_texture_p0(uinfo->contents[i]));
|
||||||
|
|
||||||
@@ -451,6 +461,8 @@ vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader)
|
|||||||
case QUNIFORM_TEXTURE_DEPTH:
|
case QUNIFORM_TEXTURE_DEPTH:
|
||||||
case QUNIFORM_TEXTURE_ARRAY_SIZE:
|
case QUNIFORM_TEXTURE_ARRAY_SIZE:
|
||||||
case QUNIFORM_TEXTURE_LEVELS:
|
case QUNIFORM_TEXTURE_LEVELS:
|
||||||
|
case QUNIFORM_SPILL_OFFSET:
|
||||||
|
case QUNIFORM_SPILL_SIZE_PER_THREAD:
|
||||||
/* We could flag this on just the stage we're
|
/* We could flag this on just the stage we're
|
||||||
* compiling for, but it's not passed in.
|
* compiling for, but it's not passed in.
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user