intel/vec4: Add support for UBO pushing

Shader-db results on Haswell (vec4 only):

    total instructions in shared programs: 2853928 -> 2726576 (-4.46%)
    instructions in affected programs: 855840 -> 728488 (-14.88%)
    helped: 9500
    HURT: 18
    helped stats (abs) min: 1 max: 359 x̄: 13.54 x̃: 11
    helped stats (rel) min: 0.44% max: 53.33% x̄: 19.13% x̃: 17.44%
    HURT stats (abs)   min: 4 max: 124 x̄: 71.00 x̃: 92
    HURT stats (rel)   min: 3.64% max: 77.86% x̄: 46.43% x̃: 52.12%
    95% mean confidence interval for instructions value: -13.78 -12.98
    95% mean confidence interval for instructions %-change: -19.21% -18.81%
    Instructions are helped.

    total cycles in shared programs: 101822616 -> 60245580 (-40.83%)
    cycles in affected programs: 93312382 -> 51735346 (-44.56%)
    helped: 13292
    HURT: 4506
    helped stats (abs) min: 2 max: 1229260 x̄: 3370.82 x̃: 776
    helped stats (rel) min: 0.04% max: 96.70% x̄: 47.56% x̃: 43.76%
    HURT stats (abs)   min: 2 max: 17644 x̄: 716.37 x̃: 82
    HURT stats (rel)   min: 0.02% max: 491.80% x̄: 41.00% x̃: 11.11%
    95% mean confidence interval for cycles value: -3037.07 -1635.03
    95% mean confidence interval for cycles %-change: -26.03% -24.25%
    Cycles are helped.

    total spills in shared programs: 1080 -> 1314 (21.67%)
    spills in affected programs: 74 -> 308 (316.22%)
    helped: 0
    HURT: 47

    total fills in shared programs: 310 -> 497 (60.32%)
    fills in affected programs: 71 -> 258 (263.38%)
    helped: 0
    HURT: 47

    total sends in shared programs: 239884 -> 151799 (-36.72%)
    sends in affected programs: 129302 -> 41217 (-68.12%)
    helped: 9547
    HURT: 0
    helped stats (abs) min: 1 max: 226 x̄: 9.23 x̃: 8
    helped stats (rel) min: 3.12% max: 98.15% x̄: 72.38% x̃: 80.00%
    95% mean confidence interval for sends value: -9.48 -8.98
    95% mean confidence interval for sends %-change: -72.80% -71.97%
    Sends are helped.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10571>
This commit is contained in:
Jason Ekstrand
2021-05-02 17:19:02 -05:00
committed by Marge Bot
parent 89fd196f6b
commit ebba3cad81
5 changed files with 47 additions and 17 deletions

View File

@@ -78,8 +78,6 @@ offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
return offset(reg, bld.dispatch_width(), delta); return offset(reg, bld.dispatch_width(), delta);
} }
#define UBO_START ((1 << 16) - 4)
struct shader_stats { struct shader_stats {
const char *scheduler_mode; const char *scheduler_mode;
unsigned promoted_constants; unsigned promoted_constants;

View File

@@ -202,8 +202,7 @@ brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
{ {
const struct intel_device_info *devinfo = compiler->devinfo; const struct intel_device_info *devinfo = compiler->devinfo;
if ((devinfo->verx10 <= 70) || if (devinfo->verx10 <= 70) {
!compiler->scalar_stage[nir->info.stage]) {
memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range)); memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
return; return;
} }

View File

@@ -40,6 +40,8 @@ enum instruction_scheduler_mode {
SCHEDULE_POST, SCHEDULE_POST,
}; };
#define UBO_START ((1 << 16) - 4)
struct backend_shader { struct backend_shader {
protected: protected:

View File

@@ -593,7 +593,7 @@ vec4_visitor::split_uniform_registers()
*/ */
foreach_block_and_inst(block, vec4_instruction, inst, cfg) { foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) { for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM) if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue; continue;
assert(!inst->src[i].reladdr); assert(!inst->src[i].reladdr);
@@ -672,7 +672,7 @@ vec4_visitor::pack_uniform_registers()
} }
for (int i = 0 ; i < 3; i++) { for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM) if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue; continue;
assert(type_sz(inst->src[i].type) % 4 == 0); assert(type_sz(inst->src[i].type) % 4 == 0);
@@ -782,7 +782,7 @@ vec4_visitor::pack_uniform_registers()
for (int i = 0 ; i < 3; i++) { for (int i = 0 ; i < 3; i++) {
int src = inst->src[i].nr; int src = inst->src[i].nr;
if (inst->src[i].file != UNIFORM) if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
continue; continue;
int chan = new_chan[src] / channel_sizes[src]; int chan = new_chan[src] / channel_sizes[src];
@@ -977,7 +977,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
*/ */
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
for (int i = 0 ; i < 3; i++) { for (int i = 0 ; i < 3; i++) {
if (inst->src[i].file != UNIFORM || if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START ||
pull_constant_loc[inst->src[i].nr] == -1) pull_constant_loc[inst->src[i].nr] == -1)
continue; continue;
@@ -2078,11 +2078,19 @@ vec4_visitor::convert_to_hw_regs()
} }
case UNIFORM: { case UNIFORM: {
reg = stride(byte_offset(brw_vec4_grf( if (src.nr >= UBO_START) {
prog_data->base.dispatch_grf_start_reg + reg = byte_offset(brw_vec4_grf(
src.nr / 2, src.nr % 2 * 4), prog_data->base.dispatch_grf_start_reg +
src.offset), ubo_push_start[src.nr - UBO_START] +
0, 4, 1); src.offset / 32, 0),
src.offset % 32);
} else {
reg = byte_offset(brw_vec4_grf(
prog_data->base.dispatch_grf_start_reg +
src.nr / 2, src.nr % 2 * 4),
src.offset);
}
reg = stride(reg, 0, 4, 1);
reg.type = src.type; reg.type = src.type;
reg.abs = src.abs; reg.abs = src.abs;
reg.negate = src.negate; reg.negate = src.negate;

View File

@@ -624,8 +624,6 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
case nir_intrinsic_load_ubo: { case nir_intrinsic_load_ubo: {
src_reg surf_index; src_reg surf_index;
prog_data->base.has_ubo_pull = true;
dest = get_nir_dest(instr->dest); dest = get_nir_dest(instr->dest);
if (nir_src_is_const(instr->src[0])) { if (nir_src_is_const(instr->src[0])) {
@@ -647,10 +645,31 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
surf_index = emit_uniformize(surf_index); surf_index = emit_uniformize(surf_index);
} }
src_reg push_reg;
src_reg offset_reg; src_reg offset_reg;
if (nir_src_is_const(instr->src[1])) { if (nir_src_is_const(instr->src[1])) {
unsigned load_offset = nir_src_as_uint(instr->src[1]); unsigned load_offset = nir_src_as_uint(instr->src[1]);
offset_reg = brw_imm_ud(load_offset & ~15); unsigned aligned_offset = load_offset & ~15;
offset_reg = brw_imm_ud(aligned_offset);
/* See if we've selected this as a push constant candidate */
if (nir_src_is_const(instr->src[0])) {
const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
const unsigned offset_256b = aligned_offset / 32;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &prog_data->base.ubo_ranges[i];
if (range->block == ubo_block &&
offset_256b >= range->start &&
offset_256b < range->start + range->length) {
push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i));
push_reg.type = dest.type;
push_reg.offset = aligned_offset - 32 * range->start;
break;
}
}
}
} else { } else {
offset_reg = src_reg(this, glsl_type::uint_type); offset_reg = src_reg(this, glsl_type::uint_type);
emit(MOV(dst_reg(offset_reg), emit(MOV(dst_reg(offset_reg),
@@ -658,12 +677,15 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
} }
src_reg packed_consts; src_reg packed_consts;
if (nir_dest_bit_size(instr->dest) == 32) { if (push_reg.file != BAD_FILE) {
packed_consts = push_reg;
} else if (nir_dest_bit_size(instr->dest) == 32) {
packed_consts = src_reg(this, glsl_type::vec4_type); packed_consts = src_reg(this, glsl_type::vec4_type);
emit_pull_constant_load_reg(dst_reg(packed_consts), emit_pull_constant_load_reg(dst_reg(packed_consts),
surf_index, surf_index,
offset_reg, offset_reg,
NULL, NULL /* before_block/inst */); NULL, NULL /* before_block/inst */);
prog_data->base.has_ubo_pull = true;
} else { } else {
src_reg temp = src_reg(this, glsl_type::dvec4_type); src_reg temp = src_reg(this, glsl_type::dvec4_type);
src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F); src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);
@@ -676,6 +698,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u))); emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)), emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
surf_index, offset_reg, NULL, NULL); surf_index, offset_reg, NULL, NULL);
prog_data->base.has_ubo_pull = true;
packed_consts = src_reg(this, glsl_type::dvec4_type); packed_consts = src_reg(this, glsl_type::dvec4_type);
shuffle_64bit_data(dst_reg(packed_consts), temp, false); shuffle_64bit_data(dst_reg(packed_consts), temp, false);