iris/gen9: Optimize slice and subslice load balancing behavior.
See "i965/gen9: Optimize slice and subslice load balancing behavior." for the rationale.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
@@ -307,6 +307,12 @@ iris_blorp_exec(struct blorp_batch *blorp_batch,
|
|||||||
|
|
||||||
iris_require_command_space(batch, 1400);
|
iris_require_command_space(batch, 1400);
|
||||||
|
|
||||||
|
const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
|
||||||
|
if (ice->state.current_hash_scale != scale) {
|
||||||
|
genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0,
|
||||||
|
params->y1 - params->y0, scale);
|
||||||
|
}
|
||||||
|
|
||||||
blorp_exec(blorp_batch, params);
|
blorp_exec(blorp_batch, params);
|
||||||
|
|
||||||
/* We've smashed all state compared to what the normal 3D pipeline
|
/* We've smashed all state compared to what the normal 3D pipeline
|
||||||
|
@@ -98,6 +98,7 @@ iris_lost_context_state(struct iris_batch *batch)
|
|||||||
}
|
}
|
||||||
|
|
||||||
ice->state.dirty = ~0ull;
|
ice->state.dirty = ~0ull;
|
||||||
|
ice->state.current_hash_scale = 0;
|
||||||
memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
|
memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
|
||||||
batch->last_surface_base_address = ~0ull;
|
batch->last_surface_base_address = ~0ull;
|
||||||
ice->vtbl.lost_genx_state(ice, batch);
|
ice->vtbl.lost_genx_state(ice, batch);
|
||||||
|
@@ -733,6 +733,9 @@ struct iris_context {
|
|||||||
|
|
||||||
/** Records the size of variable-length state for INTEL_DEBUG=bat */
|
/** Records the size of variable-length state for INTEL_DEBUG=bat */
|
||||||
struct hash_table_u64 *sizes;
|
struct hash_table_u64 *sizes;
|
||||||
|
|
||||||
|
/** Last rendering scale argument provided to genX(emit_hashing_mode). */
|
||||||
|
unsigned current_hash_scale;
|
||||||
} state;
|
} state;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -33,6 +33,10 @@ void genX(emit_urb_setup)(struct iris_context *ice,
|
|||||||
struct iris_batch *batch,
|
struct iris_batch *batch,
|
||||||
const unsigned size[4],
|
const unsigned size[4],
|
||||||
bool tess_present, bool gs_present);
|
bool tess_present, bool gs_present);
|
||||||
|
void genX(emit_hashing_mode)(struct iris_context *ice,
|
||||||
|
struct iris_batch *batch,
|
||||||
|
unsigned width, unsigned height,
|
||||||
|
unsigned scale);
|
||||||
|
|
||||||
/* iris_blorp.c */
|
/* iris_blorp.c */
|
||||||
void genX(init_blorp)(struct iris_context *ice);
|
void genX(init_blorp)(struct iris_context *ice);
|
||||||
|
@@ -5192,6 +5192,9 @@ iris_upload_dirty_render_state(struct iris_context *ice,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ice->state.current_hash_scale != 1)
|
||||||
|
genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
|
||||||
|
|
||||||
/* TODO: Gen8 PMA fix */
|
/* TODO: Gen8 PMA fix */
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -6462,6 +6465,99 @@ iris_emit_mi_report_perf_count(struct iris_batch *batch,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the pixel hashing modes that determine the balancing of PS threads
|
||||||
|
* across subslices and slices.
|
||||||
|
*
|
||||||
|
* \param width Width bound of the rendering area (already scaled down if \p
|
||||||
|
* scale is greater than 1).
|
||||||
|
* \param height Height bound of the rendering area (already scaled down if \p
|
||||||
|
* scale is greater than 1).
|
||||||
|
* \param scale The number of framebuffer samples that could potentially be
|
||||||
|
* affected by an individual channel of the PS thread. This is
|
||||||
|
* typically one for single-sampled rendering, but for operations
|
||||||
|
* like CCS resolves and fast clears a single PS invocation may
|
||||||
|
* update a huge number of pixels, in which case a finer
|
||||||
|
* balancing is desirable in order to maximally utilize the
|
||||||
|
* bandwidth available. UINT_MAX can be used as shorthand for
|
||||||
|
* "finest hashing mode available".
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
|
||||||
|
unsigned width, unsigned height, unsigned scale)
|
||||||
|
{
|
||||||
|
#if GEN_GEN == 9
|
||||||
|
const struct gen_device_info *devinfo = &batch->screen->devinfo;
|
||||||
|
const unsigned slice_hashing[] = {
|
||||||
|
/* Because all Gen9 platforms with more than one slice require
|
||||||
|
* three-way subslice hashing, a single "normal" 16x16 slice hashing
|
||||||
|
* block is guaranteed to suffer from substantial imbalance, with one
|
||||||
|
* subslice receiving twice as much work as the other two in the
|
||||||
|
* slice.
|
||||||
|
*
|
||||||
|
* The performance impact of that would be particularly severe when
|
||||||
|
* three-way hashing is also in use for slice balancing (which is the
|
||||||
|
* case for all Gen9 GT4 platforms), because one of the slices
|
||||||
|
* receives one every three 16x16 blocks in either direction, which
|
||||||
|
* is roughly the periodicity of the underlying subslice imbalance
|
||||||
|
* pattern ("roughly" because in reality the hardware's
|
||||||
|
* implementation of three-way hashing doesn't do exact modulo 3
|
||||||
|
* arithmetic, which somewhat decreases the magnitude of this effect
|
||||||
|
* in practice). This leads to a systematic subslice imbalance
|
||||||
|
* within that slice regardless of the size of the primitive. The
|
||||||
|
* 32x32 hashing mode guarantees that the subslice imbalance within a
|
||||||
|
* single slice hashing block is minimal, largely eliminating this
|
||||||
|
* effect.
|
||||||
|
*/
|
||||||
|
_32x32,
|
||||||
|
/* Finest slice hashing mode available. */
|
||||||
|
NORMAL
|
||||||
|
};
|
||||||
|
const unsigned subslice_hashing[] = {
|
||||||
|
/* 16x16 would provide a slight cache locality benefit especially
|
||||||
|
* visible in the sampler L1 cache efficiency of low-bandwidth
|
||||||
|
* non-LLC platforms, but it comes at the cost of greater subslice
|
||||||
|
* imbalance for primitives of dimensions approximately intermediate
|
||||||
|
* between 16x4 and 16x16.
|
||||||
|
*/
|
||||||
|
_16x4,
|
||||||
|
/* Finest subslice hashing mode available. */
|
||||||
|
_8x4
|
||||||
|
};
|
||||||
|
/* Dimensions of the smallest hashing block of a given hashing mode. If
|
||||||
|
* the rendering area is smaller than this there can't possibly be any
|
||||||
|
* benefit from switching to this mode, so we optimize out the
|
||||||
|
* transition.
|
||||||
|
*/
|
||||||
|
const unsigned min_size[][2] = {
|
||||||
|
{ 16, 4 },
|
||||||
|
{ 8, 4 }
|
||||||
|
};
|
||||||
|
const unsigned idx = scale > 1;
|
||||||
|
|
||||||
|
if (width > min_size[idx][0] || height > min_size[idx][1]) {
|
||||||
|
uint32_t gt_mode;
|
||||||
|
|
||||||
|
iris_pack_state(GENX(GT_MODE), &gt_mode, reg) {
|
||||||
|
reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
|
||||||
|
reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
|
||||||
|
reg.SubsliceHashing = subslice_hashing[idx];
|
||||||
|
reg.SubsliceHashingMask = -1;
|
||||||
|
};
|
||||||
|
|
||||||
|
iris_emit_raw_pipe_control(batch,
|
||||||
|
"workaround: CS stall before GT_MODE LRI",
|
||||||
|
PIPE_CONTROL_STALL_AT_SCOREBOARD |
|
||||||
|
PIPE_CONTROL_CS_STALL,
|
||||||
|
NULL, 0, 0);
|
||||||
|
|
||||||
|
iris_emit_lri(batch, GT_MODE, gt_mode);
|
||||||
|
|
||||||
|
ice->state.current_hash_scale = scale;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
genX(init_state)(struct iris_context *ice)
|
genX(init_state)(struct iris_context *ice)
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user