vc4: Add support for ARL and indirect register access on TGSI_FILE_CONSTANT.
Fixes 14 ARB_vp tests (which had no lowering done), and should improve performance of indirect uniform array access in GLSL.
This commit is contained in:
@@ -128,6 +128,7 @@ struct exec_info {
|
||||
* Setup") for definitions of the texture parameters.
|
||||
*/
|
||||
struct vc4_texture_sample_info {
        /* Set by the shader validator for a direct-addressed TMU load.
         * reloc_tex() then validates p0 as a byte offset into the BO and p1
         * as the clamp value, instead of decoding texture config words.
         */
        bool is_direct;

        /* Offsets recorded by the shader validator (taken from its running
         * uniforms_size) for each TMU parameter of this sample.  Entries
         * for parameters that were never written hold ~0.
         */
        uint32_t p_offset[4];
};
|
||||
|
||||
|
@@ -767,6 +767,23 @@ reloc_tex(struct exec_info *exec,
|
||||
uint32_t cube_map_stride = 0;
|
||||
enum vc4_texture_data_type type;
|
||||
|
||||
if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
|
||||
return false;
|
||||
|
||||
if (sample->is_direct) {
|
||||
uint32_t remaining_size = tex->base.size - p0;
|
||||
if (p0 > tex->base.size - 4) {
|
||||
DRM_ERROR("UBO offset greater than UBO size\n");
|
||||
return false;
|
||||
}
|
||||
if (p1 > remaining_size - 4) {
|
||||
DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
|
||||
return false;
|
||||
}
|
||||
*validated_p0 = tex->paddr + p0;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (width == 0)
|
||||
width = 2048;
|
||||
if (height == 0)
|
||||
@@ -832,9 +849,6 @@ reloc_tex(struct exec_info *exec,
|
||||
tiling_format = VC4_TILING_FORMAT_T;
|
||||
}
|
||||
|
||||
if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
|
||||
return false;
|
||||
|
||||
if (!check_tex_size(exec, tex, offset + cube_map_stride * 5,
|
||||
tiling_format, width, height, cpp)) {
|
||||
return false;
|
||||
|
@@ -51,8 +51,39 @@
|
||||
struct vc4_shader_validation_state {
|
||||
struct vc4_texture_sample_info tmu_setup[2];
|
||||
int tmu_write_count[2];
|
||||
|
||||
/* For registers that were last written to by a MIN instruction with
|
||||
* one argument being a uniform, the address of the uniform.
|
||||
* Otherwise, ~0.
|
||||
*
|
||||
* This is used for the validation of direct address memory reads.
|
||||
*/
|
||||
uint32_t live_clamp_offsets[32 + 32 + 4];
|
||||
};
|
||||
|
||||
static uint32_t
|
||||
waddr_to_live_reg_index(uint32_t waddr, bool is_b)
|
||||
{
|
||||
if (waddr < 32) {
|
||||
if (is_b)
|
||||
return 32 + waddr;
|
||||
else
|
||||
return waddr;
|
||||
} else if (waddr <= QPU_W_ACC3) {
|
||||
|
||||
return 64 + waddr - QPU_W_ACC0;
|
||||
} else {
|
||||
return ~0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
is_tmu_submit(uint32_t waddr)
|
||||
{
|
||||
return (waddr == QPU_W_TMU0_S ||
|
||||
waddr == QPU_W_TMU1_S);
|
||||
}
|
||||
|
||||
static bool
|
||||
is_tmu_write(uint32_t waddr)
|
||||
{
|
||||
@@ -75,24 +106,86 @@ record_validated_texture_sample(struct vc4_validated_shader_info *validated_shad
|
||||
if (!temp_samples)
|
||||
return false;
|
||||
|
||||
memcpy(temp_samples[s].p_offset,
|
||||
validation_state->tmu_setup[tmu].p_offset,
|
||||
validation_state->tmu_write_count[tmu] * sizeof(uint32_t));
|
||||
for (i = validation_state->tmu_write_count[tmu]; i < 4; i++)
|
||||
temp_samples[s].p_offset[i] = ~0;
|
||||
memcpy(&temp_samples[s],
|
||||
&validation_state->tmu_setup[tmu],
|
||||
sizeof(*temp_samples));
|
||||
|
||||
validated_shader->num_texture_samples = s + 1;
|
||||
validated_shader->texture_samples = temp_samples;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
validation_state->tmu_setup[tmu].p_offset[i] = ~0;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
|
||||
check_tmu_write(uint64_t inst,
|
||||
struct vc4_validated_shader_info *validated_shader,
|
||||
struct vc4_shader_validation_state *validation_state,
|
||||
uint32_t waddr)
|
||||
bool is_mul)
|
||||
{
|
||||
uint32_t waddr = (is_mul ?
|
||||
QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
|
||||
QPU_GET_FIELD(inst, QPU_WADDR_ADD));
|
||||
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
|
||||
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
|
||||
int tmu = waddr > QPU_W_TMU0_B;
|
||||
bool submit = is_tmu_submit(waddr);
|
||||
bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
|
||||
|
||||
if (is_direct) {
|
||||
uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
|
||||
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
|
||||
uint32_t clamp_offset = ~0;
|
||||
|
||||
/* Make sure that this texture load is an add of the base
|
||||
* address of the UBO to a clamped offset within the UBO.
|
||||
*/
|
||||
if (is_mul ||
|
||||
QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
|
||||
DRM_ERROR("direct TMU load wasn't an add\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* We assert that the clamped address is the first
|
||||
* argument, and the UBO base address is the second argument.
|
||||
* This is arbitrary, but simpler than supporting flipping the
|
||||
* two either way.
|
||||
*/
|
||||
if (add_a == QPU_MUX_A) {
|
||||
clamp_offset = validation_state->live_clamp_offsets[raddr_a];
|
||||
} else if (add_a == QPU_MUX_B) {
|
||||
clamp_offset = validation_state->live_clamp_offsets[32 + raddr_b];
|
||||
} else if (add_a <= QPU_MUX_R4) {
|
||||
clamp_offset = validation_state->live_clamp_offsets[64 + add_a];
|
||||
}
|
||||
|
||||
if (clamp_offset == ~0) {
|
||||
DRM_ERROR("direct TMU load wasn't clamped\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Store the clamp value's offset in p1 (see reloc_tex() in
|
||||
* vc4_validate.c).
|
||||
*/
|
||||
validation_state->tmu_setup[tmu].p_offset[1] =
|
||||
clamp_offset;
|
||||
|
||||
if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
|
||||
!(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
|
||||
DRM_ERROR("direct TMU load didn't add to a uniform\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
validation_state->tmu_setup[tmu].is_direct = true;
|
||||
} else {
|
||||
if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) {
|
||||
DRM_ERROR("uniform read in the same instruction as "
|
||||
"texture setup.\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (validation_state->tmu_write_count[tmu] >= 4) {
|
||||
DRM_ERROR("TMU%d got too many parameters before dispatch\n",
|
||||
@@ -102,9 +195,13 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
|
||||
validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
|
||||
validated_shader->uniforms_size;
|
||||
validation_state->tmu_write_count[tmu]++;
|
||||
validated_shader->uniforms_size += 4;
|
||||
/* Since direct uses a RADDR uniform reference, it will get counted in
|
||||
* check_instruction_reads()
|
||||
*/
|
||||
if (!is_direct)
|
||||
validated_shader->uniforms_size += 4;
|
||||
|
||||
if (waddr == QPU_W_TMU0_S || waddr == QPU_W_TMU1_S) {
|
||||
if (submit) {
|
||||
if (!record_validated_texture_sample(validated_shader,
|
||||
validation_state, tmu)) {
|
||||
return false;
|
||||
@@ -117,10 +214,17 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
|
||||
}
|
||||
|
||||
static bool
|
||||
check_register_write(struct vc4_validated_shader_info *validated_shader,
|
||||
check_register_write(uint64_t inst,
|
||||
struct vc4_validated_shader_info *validated_shader,
|
||||
struct vc4_shader_validation_state *validation_state,
|
||||
uint32_t waddr)
|
||||
bool is_mul)
|
||||
{
|
||||
uint32_t waddr = (is_mul ?
|
||||
QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
|
||||
QPU_GET_FIELD(inst, QPU_WADDR_ADD));
|
||||
bool is_b = is_mul != ((inst & QPU_PM) != 0);
|
||||
uint32_t live_reg_index;
|
||||
|
||||
switch (waddr) {
|
||||
case QPU_W_UNIFORMS_ADDRESS:
|
||||
/* XXX: We'll probably need to support this for reladdr, but
|
||||
@@ -145,8 +249,8 @@ check_register_write(struct vc4_validated_shader_info *validated_shader,
|
||||
case QPU_W_TMU1_T:
|
||||
case QPU_W_TMU1_R:
|
||||
case QPU_W_TMU1_B:
|
||||
return check_tmu_write(validated_shader, validation_state,
|
||||
waddr);
|
||||
return check_tmu_write(inst, validated_shader, validation_state,
|
||||
is_mul);
|
||||
|
||||
case QPU_W_HOST_INT:
|
||||
case QPU_W_TMU_NOSWAP:
|
||||
@@ -174,9 +278,44 @@ check_register_write(struct vc4_validated_shader_info *validated_shader,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Clear out the live offset clamp tracking for the written register.
|
||||
* If this particular instruction is setting up an offset clamp, it'll
|
||||
* get tracked immediately after we return.
|
||||
*/
|
||||
live_reg_index = waddr_to_live_reg_index(waddr, is_b);
|
||||
if (live_reg_index != ~0)
|
||||
validation_state->live_clamp_offsets[live_reg_index] = ~0;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
track_live_clamps(uint64_t inst,
|
||||
struct vc4_validated_shader_info *validated_shader,
|
||||
struct vc4_shader_validation_state *validation_state)
|
||||
{
|
||||
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
||||
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
|
||||
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
|
||||
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
|
||||
bool pm = inst & QPU_PM;
|
||||
uint32_t live_reg_index;
|
||||
|
||||
if (QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_MIN)
|
||||
return;
|
||||
|
||||
if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
|
||||
!(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
|
||||
return;
|
||||
}
|
||||
|
||||
live_reg_index = waddr_to_live_reg_index(waddr_add, pm);
|
||||
if (live_reg_index != ~0) {
|
||||
validation_state->live_clamp_offsets[live_reg_index] =
|
||||
validated_shader->uniforms_size;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
check_instruction_writes(uint64_t inst,
|
||||
struct vc4_validated_shader_info *validated_shader,
|
||||
@@ -184,33 +323,30 @@ check_instruction_writes(uint64_t inst,
|
||||
{
|
||||
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
||||
uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
|
||||
bool ok;
|
||||
|
||||
if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
|
||||
DRM_ERROR("ADD and MUL both set up textures\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return (check_register_write(validated_shader, validation_state, waddr_add) &&
|
||||
check_register_write(validated_shader, validation_state, waddr_mul));
|
||||
ok = (check_register_write(inst, validated_shader, validation_state, false) &&
|
||||
check_register_write(inst, validated_shader, validation_state, true));
|
||||
|
||||
track_live_clamps(inst, validated_shader, validation_state);
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
static bool
|
||||
check_instruction_reads(uint64_t inst,
|
||||
struct vc4_validated_shader_info *validated_shader)
|
||||
{
|
||||
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
||||
uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
|
||||
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
|
||||
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
|
||||
|
||||
if (raddr_a == QPU_R_UNIF ||
|
||||
raddr_b == QPU_R_UNIF) {
|
||||
if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) {
|
||||
DRM_ERROR("uniform read in the same instruction as "
|
||||
"texture setup\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* This can't overflow the uint32_t, because we're reading 8
|
||||
* bytes of instruction to increment by 4 here, so we'd
|
||||
* already be OOM.
|
||||
@@ -231,9 +367,15 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
|
||||
uint64_t *shader;
|
||||
struct vc4_validated_shader_info *validated_shader;
|
||||
struct vc4_shader_validation_state validation_state;
|
||||
int i;
|
||||
|
||||
memset(&validation_state, 0, sizeof(validation_state));
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
|
||||
for (i = 0; i < ARRAY_SIZE(validation_state.live_clamp_offsets); i++)
|
||||
validation_state.live_clamp_offsets[i] = ~0;
|
||||
|
||||
if (start_offset + sizeof(uint64_t) > shader_obj->base.size) {
|
||||
DRM_ERROR("shader starting at %d outside of BO sized %d\n",
|
||||
start_offset,
|
||||
|
@@ -87,12 +87,35 @@ struct vc4_uncompiled_shader {
|
||||
const struct tgsi_token *twoside_tokens;
|
||||
};
|
||||
|
||||
/**
 * One contiguous span of gallium uniform data that gets copied into the
 * shader's UBO at draw time (see vc4_upload_ubo()).
 */
struct vc4_ubo_range {
        /**
         * offset in bytes from the start of the ubo where this range is
         * uploaded.
         *
         * Only set once used is set.
         */
        uint32_t dst_offset;

        /**
         * offset in bytes from the start of the gallium uniforms where the
         * data comes from.
         */
        uint32_t src_offset;

        /** size in bytes of this ubo range */
        uint32_t size;
};
|
||||
|
||||
struct vc4_compiled_shader {
|
||||
uint64_t program_id;
|
||||
struct vc4_bo *bo;
|
||||
|
||||
struct vc4_shader_uniform_info uniforms;
|
||||
|
||||
struct vc4_ubo_range *ubo_ranges;
|
||||
uint32_t num_ubo_ranges;
|
||||
uint32_t ubo_size;
|
||||
|
||||
/** bitmask of which inputs are color inputs, for flat shade handling. */
|
||||
uint32_t color_inputs;
|
||||
|
||||
|
@@ -92,7 +92,8 @@ qir_opt_dead_code(struct vc4_compile *c)
|
||||
if (dce_tex && (inst->op == QOP_TEX_S ||
|
||||
inst->op == QOP_TEX_T ||
|
||||
inst->op == QOP_TEX_R ||
|
||||
inst->op == QOP_TEX_B)) {
|
||||
inst->op == QOP_TEX_B ||
|
||||
inst->op == QOP_TEX_DIRECT)) {
|
||||
dce(c, inst);
|
||||
progress = true;
|
||||
continue;
|
||||
|
@@ -164,9 +164,41 @@ qir_uniform_f(struct vc4_compile *c, float f)
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
get_src(struct vc4_compile *c, unsigned tgsi_op,
|
||||
struct tgsi_src_register *src, int i)
|
||||
indirect_uniform_load(struct vc4_compile *c,
|
||||
struct tgsi_full_src_register *src, int swiz)
|
||||
{
|
||||
struct tgsi_ind_register *indirect = &src->Indirect;
|
||||
struct vc4_compiler_ubo_range *range = &c->ubo_ranges[indirect->ArrayID];
|
||||
if (!range->used) {
|
||||
range->used = true;
|
||||
range->dst_offset = c->next_ubo_dst_offset;
|
||||
c->next_ubo_dst_offset += range->size;
|
||||
c->num_ubo_ranges++;
|
||||
};
|
||||
|
||||
assert(src->Register.Indirect);
|
||||
assert(indirect->File == TGSI_FILE_ADDRESS);
|
||||
|
||||
struct qreg addr_val = c->addr[indirect->Swizzle];
|
||||
struct qreg indirect_offset =
|
||||
qir_ADD(c, addr_val, qir_uniform_ui(c,
|
||||
range->dst_offset +
|
||||
(src->Register.Index * 16)+
|
||||
swiz * 4));
|
||||
indirect_offset = qir_MIN(c, indirect_offset, qir_uniform_ui(c, (range->dst_offset +
|
||||
range->size - 4)));
|
||||
|
||||
qir_TEX_DIRECT(c, indirect_offset, add_uniform(c, QUNIFORM_UBO_ADDR, 0));
|
||||
struct qreg r4 = qir_TEX_RESULT(c);
|
||||
c->num_texture_samples++;
|
||||
return qir_MOV(c, r4);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
get_src(struct vc4_compile *c, unsigned tgsi_op,
|
||||
struct tgsi_full_src_register *full_src, int i)
|
||||
{
|
||||
struct tgsi_src_register *src = &full_src->Register;
|
||||
struct qreg r = c->undef;
|
||||
|
||||
uint32_t s = i;
|
||||
@@ -187,8 +219,6 @@ get_src(struct vc4_compile *c, unsigned tgsi_op,
|
||||
abort();
|
||||
}
|
||||
|
||||
assert(!src->Indirect);
|
||||
|
||||
switch (src->File) {
|
||||
case TGSI_FILE_NULL:
|
||||
return r;
|
||||
@@ -199,8 +229,12 @@ get_src(struct vc4_compile *c, unsigned tgsi_op,
|
||||
r = c->consts[src->Index * 4 + s];
|
||||
break;
|
||||
case TGSI_FILE_CONSTANT:
|
||||
r = get_temp_for_uniform(c, QUNIFORM_UNIFORM,
|
||||
src->Index * 4 + s);
|
||||
if (src->Indirect) {
|
||||
r = indirect_uniform_load(c, full_src, s);
|
||||
} else {
|
||||
r = get_temp_for_uniform(c, QUNIFORM_UNIFORM,
|
||||
src->Index * 4 + s);
|
||||
}
|
||||
break;
|
||||
case TGSI_FILE_INPUT:
|
||||
r = c->inputs[src->Index * 4 + s];
|
||||
@@ -250,6 +284,10 @@ update_dst(struct vc4_compile *c, struct tgsi_full_instruction *tgsi_inst,
|
||||
c->num_outputs = MAX2(c->num_outputs,
|
||||
tgsi_dst->Index * 4 + i + 1);
|
||||
break;
|
||||
case TGSI_FILE_ADDRESS:
|
||||
assert(tgsi_dst->Index == 0);
|
||||
c->addr[i] = val;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "unknown dst file %d\n", tgsi_dst->File);
|
||||
abort();
|
||||
@@ -906,6 +944,29 @@ tgsi_to_qir_ssg(struct vc4_compile *c,
|
||||
qir_uniform_f(c, -1.0));
|
||||
}
|
||||
|
||||
/* Compare to tgsi_to_qir_flr() for the floor logic. */
|
||||
static struct qreg
|
||||
tgsi_to_qir_arl(struct vc4_compile *c,
|
||||
struct tgsi_full_instruction *tgsi_inst,
|
||||
enum qop op, struct qreg *src, int i)
|
||||
{
|
||||
struct qreg trunc = qir_FTOI(c, src[0 * 4 + i]);
|
||||
struct qreg scaled = qir_SHL(c, trunc, qir_uniform_ui(c, 4));
|
||||
|
||||
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], qir_ITOF(c, trunc)));
|
||||
|
||||
return qir_SEL_X_Y_NS(c, qir_SUB(c, scaled, qir_uniform_ui(c, 4)),
|
||||
scaled);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
tgsi_to_qir_uarl(struct vc4_compile *c,
|
||||
struct tgsi_full_instruction *tgsi_inst,
|
||||
enum qop op, struct qreg *src, int i)
|
||||
{
|
||||
return qir_SHL(c, src[0 * 4 + i], qir_uniform_ui(c, 4));
|
||||
}
|
||||
|
||||
static void
|
||||
emit_vertex_input(struct vc4_compile *c, int attr)
|
||||
{
|
||||
@@ -1086,6 +1147,24 @@ add_output(struct vc4_compile *c,
|
||||
c->output_semantics[decl_offset].swizzle = semantic_swizzle;
|
||||
}
|
||||
|
||||
static void
|
||||
add_array_info(struct vc4_compile *c, uint32_t array_id,
|
||||
uint32_t start, uint32_t size)
|
||||
{
|
||||
if (array_id >= c->ubo_ranges_array_size) {
|
||||
c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
|
||||
array_id + 1);
|
||||
c->ubo_ranges = reralloc(c, c->ubo_ranges,
|
||||
struct vc4_compiler_ubo_range,
|
||||
c->ubo_ranges_array_size);
|
||||
}
|
||||
|
||||
c->ubo_ranges[array_id].dst_offset = 0;
|
||||
c->ubo_ranges[array_id].src_offset = start;
|
||||
c->ubo_ranges[array_id].size = size;
|
||||
c->ubo_ranges[array_id].used = false;
|
||||
}
|
||||
|
||||
static void
|
||||
emit_tgsi_declaration(struct vc4_compile *c,
|
||||
struct tgsi_full_declaration *decl)
|
||||
@@ -1152,6 +1231,14 @@ emit_tgsi_declaration(struct vc4_compile *c,
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case TGSI_FILE_CONSTANT:
|
||||
add_array_info(c,
|
||||
decl->Array.ArrayID,
|
||||
decl->Range.First * 16,
|
||||
(decl->Range.Last -
|
||||
decl->Range.First + 1) * 16);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1219,6 +1306,8 @@ emit_tgsi_instruction(struct vc4_compile *c,
|
||||
[TGSI_OPCODE_COS] = { 0, tgsi_to_qir_cos },
|
||||
[TGSI_OPCODE_CLAMP] = { 0, tgsi_to_qir_clamp },
|
||||
[TGSI_OPCODE_SSG] = { 0, tgsi_to_qir_ssg },
|
||||
[TGSI_OPCODE_ARL] = { 0, tgsi_to_qir_arl },
|
||||
[TGSI_OPCODE_UARL] = { 0, tgsi_to_qir_uarl },
|
||||
};
|
||||
static int asdf = 0;
|
||||
uint32_t tgsi_op = tgsi_inst->Instruction.Opcode;
|
||||
@@ -1231,7 +1320,7 @@ emit_tgsi_instruction(struct vc4_compile *c,
|
||||
for (int i = 0; i < 4; i++) {
|
||||
src_regs[4 * s + i] =
|
||||
get_src(c, tgsi_inst->Instruction.Opcode,
|
||||
&tgsi_inst->Src[s].Register, i);
|
||||
&tgsi_inst->Src[s], i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1833,6 +1922,9 @@ vc4_shader_tgsi_to_qir(struct vc4_context *vc4, enum qstage stage,
|
||||
int ret;
|
||||
|
||||
c->stage = stage;
|
||||
for (int i = 0; i < 4; i++)
|
||||
c->addr[i] = qir_uniform_f(c, 0.0);
|
||||
|
||||
c->shader_state = &key->shader_state->base;
|
||||
c->program_id = key->shader_state->program_id;
|
||||
c->variant_id = key->shader_state->compiled_variant_count++;
|
||||
@@ -2065,6 +2157,31 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
|
||||
c->qpu_inst_count * sizeof(uint64_t),
|
||||
"code");
|
||||
|
||||
/* Copy the compiler UBO range state to the compiled shader, dropping
|
||||
* out arrays that were never referenced by an indirect load.
|
||||
*
|
||||
* (Note that QIR dead code elimination of an array access still
|
||||
* leaves that array alive, though)
|
||||
*/
|
||||
if (c->num_ubo_ranges) {
|
||||
shader->num_ubo_ranges = c->num_ubo_ranges;
|
||||
shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range,
|
||||
c->num_ubo_ranges);
|
||||
uint32_t j = 0;
|
||||
for (int i = 0; i < c->ubo_ranges_array_size; i++) {
|
||||
struct vc4_compiler_ubo_range *range =
|
||||
&c->ubo_ranges[i];
|
||||
if (!range->used)
|
||||
continue;
|
||||
|
||||
shader->ubo_ranges[j].dst_offset = range->dst_offset;
|
||||
shader->ubo_ranges[j].src_offset = range->src_offset;
|
||||
shader->ubo_ranges[j].size = range->size;
|
||||
shader->ubo_size += c->ubo_ranges[i].size;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
||||
qir_compile_destroy(c);
|
||||
|
||||
struct vc4_key *dup_key;
|
||||
@@ -2461,6 +2578,24 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate,
|
||||
return fui(1.0f / dim);
|
||||
}
|
||||
|
||||
static struct vc4_bo *
|
||||
vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
|
||||
const uint32_t *gallium_uniforms)
|
||||
{
|
||||
if (!shader->ubo_size)
|
||||
return NULL;
|
||||
|
||||
struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
|
||||
uint32_t *data = vc4_bo_map(ubo);
|
||||
for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
|
||||
memcpy(data + shader->ubo_ranges[i].dst_offset,
|
||||
gallium_uniforms + shader->ubo_ranges[i].src_offset,
|
||||
shader->ubo_ranges[i].size);
|
||||
}
|
||||
|
||||
return ubo;
|
||||
}
|
||||
|
||||
void
|
||||
vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
|
||||
struct vc4_constbuf_stateobj *cb,
|
||||
@@ -2468,6 +2603,7 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
|
||||
{
|
||||
struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
|
||||
const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
|
||||
struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
|
||||
|
||||
cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
|
||||
|
||||
@@ -2512,6 +2648,10 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
|
||||
write_texture_p2(vc4, texstate, uinfo->data[i]);
|
||||
break;
|
||||
|
||||
case QUNIFORM_UBO_ADDR:
|
||||
cl_reloc(vc4, &vc4->uniforms, ubo, 0);
|
||||
break;
|
||||
|
||||
case QUNIFORM_TEXTURE_BORDER_COLOR:
|
||||
write_texture_border_color(vc4, texstate, uinfo->data[i]);
|
||||
break;
|
||||
|
@@ -93,6 +93,7 @@ static const struct qir_op_info qir_op_info[] = {
|
||||
[QOP_TEX_T] = { "tex_t", 0, 2 },
|
||||
[QOP_TEX_R] = { "tex_r", 0, 2 },
|
||||
[QOP_TEX_B] = { "tex_b", 0, 2 },
|
||||
[QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
|
||||
[QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
|
||||
[QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
|
||||
[QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
|
||||
|
@@ -122,6 +122,16 @@ enum qop {
|
||||
QOP_TEX_R,
|
||||
/** Texture LOD bias parameter write */
|
||||
QOP_TEX_B,
|
||||
|
||||
/**
|
||||
* Texture-unit 4-byte read with address provided direct in S
|
||||
* coordinate.
|
||||
*
|
||||
* The first operand is the offset from the start of the UBO, and the
|
||||
* second is the uniform that has the UBO's base pointer.
|
||||
*/
|
||||
QOP_TEX_DIRECT,
|
||||
|
||||
/**
|
||||
* Signal of texture read being necessary and then reading r4 into
|
||||
* the destination
|
||||
@@ -207,6 +217,8 @@ enum quniform_contents {
|
||||
/** A reference to a texture config parameter 2 cubemap stride uniform */
|
||||
QUNIFORM_TEXTURE_CONFIG_P2,
|
||||
|
||||
QUNIFORM_UBO_ADDR,
|
||||
|
||||
QUNIFORM_TEXRECT_SCALE_X,
|
||||
QUNIFORM_TEXRECT_SCALE_Y,
|
||||
|
||||
@@ -224,6 +236,31 @@ struct vc4_varying_semantic {
|
||||
uint8_t swizzle;
|
||||
};
|
||||
|
||||
/**
 * Compile-time bookkeeping for one declared constant array that may be read
 * through indirect addressing.
 */
struct vc4_compiler_ubo_range {
        /**
         * offset in bytes from the start of the ubo where this range is
         * uploaded.
         *
         * Only set once used is set.
         */
        uint32_t dst_offset;

        /**
         * offset in bytes from the start of the gallium uniforms where the
         * data comes from.
         */
        uint32_t src_offset;

        /** size in bytes of this ubo range */
        uint32_t size;

        /**
         * Set if this range is used by the shader for indirect uniforms
         * access.
         */
        bool used;
};
|
||||
|
||||
struct vc4_compile {
|
||||
struct vc4_context *vc4;
|
||||
struct tgsi_parse_context parser;
|
||||
@@ -236,12 +273,19 @@ struct vc4_compile {
|
||||
struct qreg *inputs;
|
||||
struct qreg *outputs;
|
||||
struct qreg *consts;
|
||||
struct qreg addr[4]; /* TGSI ARL destination. */
|
||||
uint32_t temps_array_size;
|
||||
uint32_t inputs_array_size;
|
||||
uint32_t outputs_array_size;
|
||||
uint32_t uniforms_array_size;
|
||||
uint32_t consts_array_size;
|
||||
uint32_t num_consts;
|
||||
|
||||
struct vc4_compiler_ubo_range *ubo_ranges;
|
||||
uint32_t ubo_ranges_array_size;
|
||||
uint32_t num_ubo_ranges;
|
||||
uint32_t next_ubo_dst_offset;
|
||||
|
||||
struct qreg line_x, point_x, point_y;
|
||||
struct qreg discard;
|
||||
|
||||
@@ -409,6 +453,7 @@ QIR_NODST_2(TEX_S)
|
||||
QIR_NODST_2(TEX_T)
|
||||
QIR_NODST_2(TEX_R)
|
||||
QIR_NODST_2(TEX_B)
|
||||
QIR_NODST_2(TEX_DIRECT)
|
||||
QIR_ALU0(FRAG_X)
|
||||
QIR_ALU0(FRAG_Y)
|
||||
QIR_ALU0(FRAG_Z)
|
||||
|
@@ -517,6 +517,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
|
||||
src[0]));
|
||||
break;
|
||||
|
||||
case QOP_TEX_DIRECT:
|
||||
fixup_raddr_conflict(c, &src[0], &src[1]);
|
||||
queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
|
||||
break;
|
||||
|
||||
case QOP_TEX_RESULT:
|
||||
queue(c, qpu_NOP());
|
||||
*last_inst(c) = qpu_set_sig(*last_inst(c),
|
||||
|
@@ -299,8 +299,9 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
|
||||
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
|
||||
case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
|
||||
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
|
||||
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
|
||||
return 0;
|
||||
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
|
||||
return 1;
|
||||
case PIPE_SHADER_CAP_SUBROUTINES:
|
||||
return 0;
|
||||
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
|
||||
|
Reference in New Issue
Block a user