i965: Use pushed UBO data in the scalar backend.

This actually takes advantage of the newly pushed UBO data, avoiding
pull loads.

Improves performance in GLBenchmark Manhattan 3.1 by:

   HSW: ~1%, BDW/SKL/KBL GT2: 3-4%, SKL GT4: 7-8%, APL: 4-5%.
   (thanks to Eero Tamminen for these numbers)

shader-db results on Skylake, ignoring programs with spill/fill changes:

   total instructions in shared programs: 13963994 -> 13651893 (-2.24%)
   instructions in affected programs: 4250328 -> 3938227 (-7.34%)
   helped: 28527
   HURT: 0

   total cycles in shared programs: 179808608 -> 172535170 (-4.05%)
   cycles in affected programs: 79720410 -> 72446972 (-9.12%)
   helped: 26951
   HURT: 1248

   LOST:   46
   GAINED: 21

Many "Deus Ex: Mankind Divided" shaders which already spilled end up
spill a lot more (about 240 programs hurt, 9 helped).  The cycle
estimator suggests this is still overall a win (-0.23% in cycle counts)
presumably because we trade pull loads for fills.

v2: Drop "PULL" environment variable left in for initial debugging
    (caught by Matt).

Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
Kenneth Graunke
2016-11-29 05:20:20 -08:00
parent c9ef27e77b
commit b2da123801
3 changed files with 64 additions and 1 deletions

View File

@@ -1386,7 +1386,9 @@ fs_visitor::assign_curb_setup()
unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
unsigned ubo_push_length = 0;
unsigned ubo_push_start[4];
for (int i = 0; i < 4; i++) {
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
ubo_push_length += stage_prog_data->ubo_ranges[i].length;
}
@@ -1398,7 +1400,11 @@ fs_visitor::assign_curb_setup()
if (inst->src[i].file == UNIFORM) {
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
int constant_nr;
if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
if (inst->src[i].nr >= UBO_START) {
/* constant_nr is in 32-bit units, the rest are in bytes */
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
inst->src[i].offset / 4;
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
constant_nr = push_constant_loc[uniform_nr];
} else {
/* Section 5.11 of the OpenGL 4.1 spec says:
@@ -2069,6 +2075,20 @@ fs_visitor::assign_constant_locations()
stage_prog_data->nr_params = num_push_constants;
stage_prog_data->nr_pull_params = num_pull_constants;
/* Now that we know how many regular uniforms we'll push, reduce the
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
*/
unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
for (int i = 0; i < 4; i++) {
struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
if (push_length + range->length > 64)
range->length = 64 - push_length;
push_length += range->length;
}
assert(push_length <= 64);
/* Up until now, the param[] array has been indexed by reg + offset
* of UNIFORM registers. Move pull constants into pull_param[] and
* condense param[] to only contain the uniforms we chose to push.
@@ -2103,6 +2123,19 @@ fs_visitor::get_pull_locs(const fs_reg &src,
{
assert(src.file == UNIFORM);
if (src.nr >= UBO_START) {
const struct brw_ubo_range *range =
&prog_data->ubo_ranges[src.nr - UBO_START];
/* If this access is in our (reduced) range, use the push data. */
if (src.offset / 32 < range->length)
return false;
*out_surf_index = prog_data->binding_table.ubo_start + range->block;
*out_pull_index = (32 * range->start + src.offset) / 4;
return true;
}
const unsigned location = src.nr + src.offset / 4;
if (location < uniforms && pull_constant_loc[location] != -1) {

View File

@@ -50,6 +50,8 @@ offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
return offset(reg, bld.dispatch_width(), delta);
}
#define UBO_START ((1 << 16) - 4)
/**
* The fragment shader front-end.
*

View File

@@ -3822,6 +3822,34 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
* and we have to split it if necessary.
*/
const unsigned type_size = type_sz(dest.type);
/* See if we've selected this as a push constant candidate */
if (const_index) {
const unsigned ubo_block = const_index->u32[0];
const unsigned offset_256b = const_offset->u32[0] / 32;
fs_reg push_reg;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
if (range->block == ubo_block &&
offset_256b >= range->start &&
offset_256b < range->start + range->length) {
push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
push_reg.offset = const_offset->u32[0] - 32 * range->start;
break;
}
}
if (push_reg.file != BAD_FILE) {
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(offset(dest, bld, i),
byte_offset(push_reg, i * type_size));
}
break;
}
}
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);