v3d: Use the core tex lowering.
Even without any clever optimization on the unpack operations, this
gives us a useful value for the channels read field, which we can use
to avoid ldtmu instructions to the no-op register.

instructions in affected programs: 890712 -> 881974 (-0.98%)
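In rough terms: with the core lowering doing the 16-bit packing, the TMU
word-enable mask can come straight from NIR's record of which components
the shader actually reads, and words nobody reads never get an LDTMU at
all. A minimal sketch of the idea (store_tex_word() is a hypothetical
stand-in for the driver's ntq_store_dest(..., vir_LDTMU(c)) pair, not
real driver code):

/* Derive the TMU word-enable mask from the components that are actually
 * read, then emit one LDTMU per live word. */
static void
emit_tex_returns(struct v3d_compile *c, nir_tex_instr *instr)
{
        assert(instr->dest.is_ssa);
        uint32_t word_mask = nir_ssa_def_components_read(&instr->dest.ssa);

        for (int i = 0; i < 4; i++) {
                /* Unread words get no LDTMU at all, instead of an LDTMU
                 * to the no-op register. */
                if (word_mask & (1 << i))
                        store_tex_word(c, instr, i);
        }
}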
@@ -126,19 +126,12 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 }
         }
 
-        bool return_16 = (c->key->tex[unit].return_size == 16 ||
-                          p0_unpacked.shadow);
-
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
          */
-        uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
-        if (return_16)
-                instr_return_channels = (instr_return_channels + 1) / 2;
-
+        assert(instr->dest.is_ssa);
         p1_unpacked.return_words_of_texture_data =
-                (1 << MIN2(instr_return_channels,
-                           c->key->tex[unit].return_channels)) - 1;
+                nir_ssa_def_components_read(&instr->dest.ssa);
 
         uint32_t p0_packed;
         V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL,
@@ -193,56 +186,8 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 
         vir_emit_thrsw(c);
 
-        struct qreg return_values[4];
         for (int i = 0; i < 4; i++) {
-                /* Swizzling .zw of an RG texture should give undefined
-                 * results, not crash the compiler.
-                 */
                 if (p1_unpacked.return_words_of_texture_data & (1 << i))
-                        return_values[i] = vir_LDTMU(c);
-                else
-                        return_values[i] = c->undef;
-        }
-
-        for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) {
-                struct qreg chan;
-
-                if (return_16) {
-                        STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
-                        chan = return_values[i / 2];
-
-                        if (nir_alu_type_get_base_type(instr->dest_type) ==
-                            nir_type_float) {
-                                enum v3d_qpu_input_unpack unpack;
-                                if (i & 1)
-                                        unpack = V3D_QPU_UNPACK_H;
-                                else
-                                        unpack = V3D_QPU_UNPACK_L;
-
-                                chan = vir_FMOV(c, chan);
-                                vir_set_unpack(c->defs[chan.index], 0, unpack);
-                        } else {
-                                /* If we're unpacking the low field, shift it
-                                 * up to the top first.
-                                 */
-                                if ((i & 1) == 0) {
-                                        chan = vir_SHL(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                }
-
-                                /* Do proper sign extension to a 32-bit int. */
-                                if (nir_alu_type_get_base_type(instr->dest_type) ==
-                                    nir_type_int) {
-                                        chan = vir_ASR(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                } else {
-                                        chan = vir_SHR(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                }
-                        }
-                } else {
-                        chan = vir_MOV(c, return_values[i]);
-                }
-                ntq_store_dest(c, &instr->dest, i, chan);
+                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
         }
 }
@@ -166,18 +166,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
-         *
-         * XXX perf: Can we also limit to the number of channels that are
-         * actually read by the users of this NIR dest, so that we don't need
-         * to emit unused LDTMUs?
          */
-        uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
-        if (!p1_unpacked.output_type_32_bit)
-                instr_return_channels = (instr_return_channels + 1) / 2;
-
+        assert(instr->dest.is_ssa);
         p0_unpacked.return_words_of_texture_data =
-                (1 << MIN2(instr_return_channels,
-                           c->key->tex[unit].return_channels)) - 1;
+                nir_ssa_def_components_read(&instr->dest.ssa);
 
         /* Word enables can't ask for more channels than the output type could
          * provide (2 for f16, 4 for 32-bit).
@@ -232,62 +224,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         while (tmu_writes > 16 / c->threads)
                 c->threads /= 2;
 
-        struct qreg return_values[4];
         for (int i = 0; i < 4; i++) {
-                /* Swizzling .zw of an RG texture should give undefined
-                 * results, not crash the compiler.
-                 */
                 if (p0_unpacked.return_words_of_texture_data & (1 << i))
-                        return_values[i] = vir_LDTMU(c);
-                else
-                        return_values[i] = c->undef;
-        }
-
-        for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) {
-                struct qreg chan;
-
-                if (!p1_unpacked.output_type_32_bit) {
-                        STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
-                        chan = return_values[i / 2];
-
-                        /* XXX perf: We should move this unpacking into NIR.
-                         * That would give us exposure of these types to NIR
-                         * optimization, so that (for example) a repacking of
-                         * half-float samples to the half-float render target
-                         * could be eliminated.
-                         */
-                        if (nir_alu_type_get_base_type(instr->dest_type) ==
-                            nir_type_float) {
-                                enum v3d_qpu_input_unpack unpack;
-                                if (i & 1)
-                                        unpack = V3D_QPU_UNPACK_H;
-                                else
-                                        unpack = V3D_QPU_UNPACK_L;
-
-                                chan = vir_FMOV(c, chan);
-                                vir_set_unpack(c->defs[chan.index], 0, unpack);
-                        } else {
-                                /* If we're unpacking the low field, shift it
-                                 * up to the top first.
-                                 */
-                                if ((i & 1) == 0) {
-                                        chan = vir_SHL(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                }
-
-                                /* Do proper sign extension to a 32-bit int. */
-                                if (nir_alu_type_get_base_type(instr->dest_type) ==
-                                    nir_type_int) {
-                                        chan = vir_ASR(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                } else {
-                                        chan = vir_SHR(c, chan,
-                                                       vir_uniform_ui(c, 16));
-                                }
-                        }
-                } else {
-                        chan = vir_MOV(c, return_values[i]);
-                }
-                ntq_store_dest(c, &instr->dest, i, chan);
+                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
         }
 }
@@ -630,6 +630,10 @@ v3d_lower_nir(struct v3d_compile *c)
                         tex_options.saturate_t |= 1 << i;
                 if (c->key->tex[i].clamp_r)
                         tex_options.saturate_r |= 1 << i;
+                if (c->key->tex[i].return_size == 16) {
+                        tex_options.lower_tex_packing[i] =
+                                nir_lower_tex_packing_16;
+                }
         }
 
         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
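For reference, the channel unpacking that the deleted loops above did by
hand (UNPACK_H/UNPACK_L moves for floats, SHL+ASR or SHR pairs for
integers) is ordinary 16-bit field extraction, which the core lowering
now expresses as NIR ALU ops where the optimizer can see them. A
standalone C illustration of that arithmetic (demo code, not from this
commit; assumes the usual arithmetic right shift on signed ints, which
is what ASR provides on the hardware):

#include <stdint.h>
#include <stdio.h>

/* Each 32-bit TMU return word packs two 16-bit channels: channel i is
 * in the low half when (i & 1) == 0, the high half otherwise. */
static uint32_t unpack_u16(uint32_t word, int i)
{
        return (i & 1) ? word >> 16 : word & 0xffff;   /* SHR: zero-extend */
}

static int32_t unpack_i16(uint32_t word, int i)
{
        /* SHL then ASR: move the field to the top, then shift back down
         * with sign extension to a full 32-bit int. */
        uint32_t top = (i & 1) ? word : word << 16;
        return (int32_t)top >> 16;
}

int main(void)
{
        uint32_t word = 0xfffe0003;    /* channel 0 = 3, channel 1 = -2 */
        printf("u16: %u %u\n", unpack_u16(word, 0), unpack_u16(word, 1));
        printf("i16: %d %d\n", unpack_i16(word, 0), unpack_i16(word, 1));
        return 0;
}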