anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
This commit is contained in:
Jason Ekstrand
2019-11-25 21:55:51 -06:00
parent 1b5cb92b62
commit 46af0ecc1d
6 changed files with 245 additions and 20 deletions

View File

@@ -141,8 +141,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
}
}
/* We only allow 48-bit addresses with softpin because knowing the actual
* address is required for the vertex cache flush workaround.
*/
device->supports_48bit_addresses = (device->info.gen >= 8) &&
gtt_size > (4ULL << 30 /* GiB */);
device->has_softpin &&
gtt_size > (4ULL << 30 /* GiB */);
uint64_t heap_size = anv_compute_heap_size(fd, gtt_size);
@@ -471,10 +475,6 @@ anv_physical_device_init(struct anv_physical_device *device,
goto fail;
}
result = anv_physical_device_init_heaps(device, fd);
if (result != VK_SUCCESS)
goto fail;
device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN);
device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
@@ -484,6 +484,10 @@ anv_physical_device_init(struct anv_physical_device *device,
anv_gem_supports_syncobj_wait(fd);
device->has_context_priority = anv_gem_has_context_priority(fd);
result = anv_physical_device_init_heaps(device, fd);
if (result != VK_SUCCESS)
goto fail;
device->use_softpin = device->has_softpin &&
device->supports_48bit_addresses;

View File

@@ -44,6 +44,14 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
/* Records a new vertex/index buffer binding for the Gen8-9 VF cache
 * workaround.
 *
 * vb_index selects the tracked slot; -1 selects the index buffer slot.
 * vb_address/vb_size describe the bound range; a vb_size of 0 clears the
 * bound range for that slot.  May add a CS stall and VF cache invalidate to
 * the command buffer's pending pipe bits.
 */
void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
int vb_index,
struct anv_address vb_address,
uint32_t vb_size);
/* Copies the bound ranges into the dirty ranges for the buffers a draw used.
 *
 * access_type is compared against RANDOM to decide whether the index buffer
 * range is included.  vb_used is a bitmask of vertex buffer indices, one bit
 * per slot.
 */
void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
uint32_t access_type,
uint64_t vb_used);
void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned width, unsigned height,
unsigned scale);

View File

@@ -2503,6 +2503,27 @@ struct anv_attachment_state {
struct anv_image_view * image_view;
};
/** State tracking for vertex buffer flushes
 *
 * On Gen8-9, the VF cache only considers the bottom 32 bits of memory
 * addresses.  If you happen to have two vertex buffers which get placed
 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
 * collisions.  In order to solve this problem, we track vertex address ranges
 * which are live in the cache and invalidate the cache if one ever exceeds 32
 * bits.
 *
 * The range is half-open: [start, end).  The tracking code resets ranges by
 * zeroing them and records a zero-sized binding as start == end == 0, so
 * start == end denotes "nothing tracked".
 */
struct anv_vb_cache_range {
/* Virtual address at which the live vertex buffer cache range starts for
 * this vertex buffer index (inclusive).
 */
uint64_t start;
/* Virtual address of the byte after where vertex buffer cache range ends.
 * This is exclusive such that end - start is the size of the range.
 */
uint64_t end;
};
/** State tracking for particular pipeline bind point
*
* This struct is the base struct for anv_cmd_graphics_state and
@@ -2531,6 +2552,11 @@ struct anv_cmd_graphics_state {
anv_cmd_dirty_mask_t dirty;
uint32_t vb_dirty;
struct anv_vb_cache_range ib_bound_range;
struct anv_vb_cache_range ib_dirty_range;
struct anv_vb_cache_range vb_bound_ranges[33];
struct anv_vb_cache_range vb_dirty_ranges[33];
struct anv_dynamic_state dynamic;
struct {

View File

@@ -139,19 +139,6 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
struct blorp_address *addr)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
*
* "The VF cache needs to be invalidated before binding and then using
* Vertex Buffers that overlap with any previously bound Vertex Buffer
* (at a 64B granularity) since the last invalidation. A VF cache
* invalidate is performed by setting the "VF Cache Invalidation Enable"
* bit in PIPE_CONTROL."
*
* This restriction first appears in the Skylake PRM but the internal docs
* also list it as being an issue on Broadwell. In order to avoid this
* problem, we align all vertex buffer allocations to 64 bytes.
*/
struct anv_state vb_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
@@ -170,9 +157,25 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
uint32_t *sizes,
unsigned num_vbs)
{
/* anv forces all vertex buffers into the low 4GB so there are never any
* transitions that require a VF invalidation.
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
for (unsigned i = 0; i < num_vbs; i++) {
struct anv_address anv_addr = {
.bo = addrs[i].buffer,
.offset = addrs[i].offset,
};
genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
i, anv_addr, sizes[i]);
}
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* Technically, we should call this *after* 3DPRIMITIVE but it doesn't
* really matter for blorp because we never call apply_pipe_flushes after
* this point.
*/
genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
(1 << num_vbs) - 1);
}
#if GEN_GEN >= 8

View File

@@ -1392,6 +1392,10 @@ genX(BeginCommandBuffer)(
* executing anything. The chances are fairly high that they will use
* blorp at least once per primary command buffer so it shouldn't be
* wasted.
*
* There is also a workaround on gen8 which requires us to invalidate the
* VF cache occasionally. It's easier if we can assume we start with a
* fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).)
*/
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
@@ -1598,6 +1602,14 @@ genX(CmdExecuteCommands)(
anv_cmd_buffer_add_secondary(primary, secondary);
}
/* The secondary isn't counted in our VF cache tracking so we need to
* invalidate the whole thing.
*/
if (GEN_GEN >= 8 && GEN_GEN <= 9) {
primary->state.pending_pipe_bits |=
ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
}
/* The secondary may have selected a different pipeline (3D or compute) and
* may have changed the current L3$ configuration. Reset our tracking
* variables to invalid values to ensure that we re-emit these in the case
@@ -1836,6 +1848,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
}
if ((GEN_GEN >= 8 && GEN_GEN <= 9) &&
(bits & ANV_PIPE_CS_STALL_BIT) &&
(bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
/* If we are doing a VF cache invalidate AND a CS stall (it must be
* both) then we can reset our vertex cache tracking.
*/
memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
sizeof(cmd_buffer->state.gfx.ib_dirty_range));
}
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
#if GEN_GEN >= 12
@@ -2830,6 +2854,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
#endif
};
#if GEN_GEN >= 8 && GEN_GEN <= 9
genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb,
state.BufferStartingAddress,
state.BufferSize);
#endif
GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
i++;
}
@@ -2967,6 +2997,9 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
.EndAddress = anv_address_add(addr, size),
#endif
});
genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
index, addr, size);
}
static void
@@ -3014,6 +3047,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
}
/* Marks the vertex buffers consumed by the draw that was just emitted as
 * dirty for the Gen8-9 VF cache workaround.
 *
 * The set of used buffers is the pipeline's vb_used mask, plus the
 * ANV_SVGS_VB_INDEX slot when the vertex shader uses firstvertex or
 * baseinstance, plus the ANV_DRAWID_VB_INDEX slot when it uses drawid.
 *
 * access_type is the 3DPRIMITIVE vertex access type (SEQUENTIAL or RANDOM)
 * and is forwarded unchanged: the callee itself compares it against RANDOM,
 * so collapsing it to a boolean here would only work while RANDOM happens to
 * be numerically equal to 1.
 */
static void
update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t access_type)
{
   struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   uint64_t vb_used = pipeline->vb_used;
   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance)
      vb_used |= 1ull << ANV_SVGS_VB_INDEX;
   if (vs_prog_data->uses_drawid)
      vb_used |= 1ull << ANV_DRAWID_VB_INDEX;

   genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer,
                                                       access_type,
                                                       vb_used);
}
void genX(CmdDraw)(
VkCommandBuffer commandBuffer,
uint32_t vertexCount,
@@ -3059,6 +3111,8 @@ void genX(CmdDraw)(
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
}
void genX(CmdDrawIndexed)(
@@ -3107,6 +3161,8 @@ void genX(CmdDrawIndexed)(
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = vertexOffset;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
}
/* Auto-Draw / Indirect Registers */
@@ -3179,6 +3235,8 @@ void genX(CmdDrawIndirectByteCountEXT)(
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = pipeline->topology;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
}
@@ -3263,6 +3321,8 @@ void genX(CmdDrawIndirect)(
prim.PrimitiveTopologyType = pipeline->topology;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
offset += stride;
}
}
@@ -3311,6 +3371,8 @@ void genX(CmdDrawIndexedIndirect)(
prim.PrimitiveTopologyType = pipeline->topology;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
offset += stride;
}
}
@@ -3465,6 +3527,8 @@ void genX(CmdDrawIndirectCountKHR)(
prim.PrimitiveTopologyType = pipeline->topology;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
offset += stride;
}
}
@@ -3530,6 +3594,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
prim.PrimitiveTopologyType = pipeline->topology;
}
update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
offset += stride;
}
}
@@ -4115,6 +4181,120 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
}
}
/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
*
* "The VF cache needs to be invalidated before binding and then using
* Vertex Buffers that overlap with any previously bound Vertex Buffer
* (at a 64B granularity) since the last invalidation. A VF cache
* invalidate is performed by setting the "VF Cache Invalidation Enable"
* bit in PIPE_CONTROL."
*
* This is implemented by carefully tracking all vertex and index buffer
* bindings and flushing if the cache ever ends up with a range in the cache
* that would exceed 4 GiB. This is implemented in three parts:
*
* 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called
* every time a 3DSTATE_VERTEX_BUFFERS packet is emitted and informs the
* tracking code of the new binding. If this new binding would cause
* the cache to have a too-large range on the next draw call, a pipeline
* stall and VF cache invalidate are added to pending_pipe_bits.
*
* 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
* empty whenever we emit a VF invalidate.
*
* 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called
* after every 3DPRIMITIVE and copies the bound range into the dirty
* range for each used buffer. This has to be a separate step because
* we don't always re-bind all buffers and so 1. can't know which
* buffers are actually bound.
*/
void
genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
int vb_index,
struct anv_address vb_address,
uint32_t vb_size)
{
if (GEN_GEN < 8 || GEN_GEN > 9 ||
!cmd_buffer->device->instance->physicalDevice.use_softpin)
return;
struct anv_vb_cache_range *bound, *dirty;
if (vb_index == -1) {
bound = &cmd_buffer->state.gfx.ib_bound_range;
dirty = &cmd_buffer->state.gfx.ib_dirty_range;
} else {
assert(vb_index >= 0);
assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
}
if (vb_size == 0) {
bound->start = 0;
bound->end = 0;
return;
}
assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
bound->start = gen_48b_address(anv_address_physical(vb_address));
bound->end = bound->start + vb_size;
assert(bound->end > bound->start); /* No overflow */
/* Align everything to a cache line */
bound->start &= ~(64ull - 1ull);
bound->end = align_u64(bound->end, 64);
/* Compute the dirty range */
dirty->start = MIN2(dirty->start, bound->start);
dirty->end = MAX2(dirty->end, bound->end);
/* If our range is larger than 32 bits, we have to flush */
assert(bound->end - bound->start <= (1ull << 32));
if (dirty->end - dirty->start > (1ull << 32)) {
cmd_buffer->state.pending_pipe_bits |=
ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
}
}
void
genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
uint32_t access_type,
uint64_t vb_used)
{
if (GEN_GEN < 8 || GEN_GEN > 9 ||
!cmd_buffer->device->instance->physicalDevice.use_softpin)
return;
if (access_type == RANDOM) {
/* We have an index buffer */
struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
if (bound->end > bound->start) {
dirty->start = MIN2(dirty->start, bound->start);
dirty->end = MAX2(dirty->end, bound->end);
}
}
uint64_t mask = vb_used;
while (mask) {
int i = u_bit_scan64(&mask);
assert(i >= 0);
assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
struct anv_vb_cache_range *bound, *dirty;
bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
if (bound->end > bound->start) {
dirty->start = MIN2(dirty->start, bound->start);
dirty->end = MAX2(dirty->end, bound->end);
}
}
}
/**
* Update the pixel hashing modes that determine the balancing of PS threads
* across subslices and slices.

View File

@@ -78,6 +78,7 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
}
genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
genX(flush_pipeline_select_3d)(cmd_buffer);
@@ -229,5 +230,8 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
prim.BaseVertexLocation = 0;
}
genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
1ull << 32);
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
}