anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
@@ -141,8 +141,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
       }
    }
 
+   /* We only allow 48-bit addresses with softpin because knowing the actual
+    * address is required for the vertex cache flush workaround.
+    */
    device->supports_48bit_addresses = (device->info.gen >= 8) &&
-                                      gtt_size > (4ULL << 30 /* GiB */);
+                                      device->has_softpin &&
+                                      gtt_size > (4ULL << 30 /* GiB */);
 
    uint64_t heap_size = anv_compute_heap_size(fd, gtt_size);
 
@@ -471,10 +475,6 @@ anv_physical_device_init(struct anv_physical_device *device,
       goto fail;
    }
 
-   result = anv_physical_device_init_heaps(device, fd);
-   if (result != VK_SUCCESS)
-      goto fail;
-
    device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN);
    device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
    device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
@@ -484,6 +484,10 @@ anv_physical_device_init(struct anv_physical_device *device,
       anv_gem_supports_syncobj_wait(fd);
    device->has_context_priority = anv_gem_has_context_priority(fd);
 
+   result = anv_physical_device_init_heaps(device, fd);
+   if (result != VK_SUCCESS)
+      goto fail;
+
    device->use_softpin = device->has_softpin &&
                          device->supports_48bit_addresses;
 
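
The comment added above gates 48-bit addressing on softpin because the workaround in this patch needs the real, pinned GPU virtual address of every vertex buffer: on Gen8-9 the VF cache tags entries with only the low 32 bits of the address, so two buffers placed exactly 4 GiB apart look identical to the cache. A minimal standalone sketch of that aliasing condition; the helper name is hypothetical and not driver code:

    #include <stdbool.h>
    #include <stdint.h>

    /* Do two GPU virtual addresses map to the same Gen8-9 VF cache tag?
     * The tag keeps only the low 32 bits of the 64-byte line address, so
     * 0x100001000 and 0x200001000 (exactly 4 GiB apart) collide.  Answering
     * this requires knowing the actual addresses, hence the softpin gate.
     */
    static bool
    vf_cache_tags_alias(uint64_t addr_a, uint64_t addr_b)
    {
       return (uint32_t)(addr_a & ~63ull) == (uint32_t)(addr_b & ~63ull);
    }
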
@@ -44,6 +44,14 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
 
+void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                                    int vb_index,
+                                                    struct anv_address vb_address,
+                                                    uint32_t vb_size);
+void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                                         uint32_t access_type,
+                                                         uint64_t vb_used);
+
 void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
                                         unsigned width, unsigned height,
                                         unsigned scale);
@@ -2503,6 +2503,27 @@ struct anv_attachment_state {
    struct anv_image_view * image_view;
 };
 
+/** State tracking for vertex buffer flushes
+ *
+ * On Gen8-9, the VF cache only considers the bottom 32 bits of memory
+ * addresses.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  In order to solve this problem, we track vertex address ranges
+ * which are live in the cache and invalidate the cache if one ever exceeds 32
+ * bits.
+ */
+struct anv_vb_cache_range {
+   /* Virtual address at which the live vertex buffer cache range starts for
+    * this vertex buffer index.
+    */
+   uint64_t start;
+
+   /* Virtual address of the byte after where vertex buffer cache range ends.
+    * This is exclusive such that end - start is the size of the range.
+    */
+   uint64_t end;
+};
+
 /** State tracking for particular pipeline bind point
  *
  * This struct is the base struct for anv_cmd_graphics_state and
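
The struct added above records, per binding slot, the span of addresses that may be live in the VF cache. A short self-contained illustration of how such a range grows across bindings and when it crosses the 4 GiB limit; the struct and function names here are local to the example, not driver API:

    #include <stdbool.h>
    #include <stdint.h>

    struct range { uint64_t start, end; };   /* end is exclusive, like anv_vb_cache_range */

    /* Fold a newly bound buffer [addr, addr + size) into the tracked range,
     * rounded out to 64-byte cache lines, and report whether the span now
     * covers more than 32 bits of address and therefore needs a VF cache
     * invalidate before the next draw.
     */
    static bool
    track_and_check(struct range *r, uint64_t addr, uint64_t size)
    {
       uint64_t start = addr & ~63ull;
       uint64_t end = (addr + size + 63ull) & ~63ull;

       if (r->end <= r->start) {             /* empty range: adopt the binding */
          r->start = start;
          r->end = end;
       } else {
          if (start < r->start) r->start = start;
          if (end > r->end) r->end = end;
       }
       return r->end - r->start > (1ull << 32);
    }

    /* Two buffers bound 4 GiB apart push the span past the limit:
     *    track_and_check(&r, 0x100001000, 4096) -> false
     *    track_and_check(&r, 0x200001000, 4096) -> true
     */
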
@@ -2531,6 +2552,11 @@ struct anv_cmd_graphics_state {
    anv_cmd_dirty_mask_t dirty;
    uint32_t vb_dirty;
 
+   struct anv_vb_cache_range ib_bound_range;
+   struct anv_vb_cache_range ib_dirty_range;
+   struct anv_vb_cache_range vb_bound_ranges[33];
+   struct anv_vb_cache_range vb_dirty_ranges[33];
+
    struct anv_dynamic_state dynamic;
 
    struct {
@@ -139,19 +139,6 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr)
 {
    struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
-
-   /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
-    *
-    *    "The VF cache needs to be invalidated before binding and then using
-    *    Vertex Buffers that overlap with any previously bound Vertex Buffer
-    *    (at a 64B granularity) since the last invalidation.  A VF cache
-    *    invalidate is performed by setting the "VF Cache Invalidation Enable"
-    *    bit in PIPE_CONTROL."
-    *
-    * This restriction first appears in the Skylake PRM but the internal docs
-    * also list it as being an issue on Broadwell.  In order to avoid this
-    * problem, we align all vertex buffer allocations to 64 bytes.
-    */
    struct anv_state vb_state =
       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
 
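
The comment removed here (the same PRM quote reappears, expanded, later in this patch in the genX_cmd_buffer.c hunk) is about the 64-byte granularity of the VF cache: two distinct buffers that merely share a 64-byte line can alias. The allocation above still asks for 64-byte alignment, which is what prevents that. A small illustrative check, assuming buffer B is laid out directly after buffer A; the names are local to this example:

    #include <stdbool.h>
    #include <stdint.h>

    /* True if the last cache line of A is also the first cache line of B,
     * i.e. the two allocations would alias at the VF cache's 64B granularity.
     * Because each allocation's start is aligned to 64 bytes, a following
     * allocation always begins on a fresh line and this never returns true.
     */
    static bool
    share_a_cache_line(uint64_t addr_a, uint64_t size_a, uint64_t addr_b)
    {
       return ((addr_a + size_a - 1) & ~63ull) == (addr_b & ~63ull);
    }
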
@@ -170,9 +157,25 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                            uint32_t *sizes,
                                            unsigned num_vbs)
 {
-   /* anv forces all vertex buffers into the low 4GB so there are never any
-    * transitions that require a VF invalidation.
-    */
+   struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+   for (unsigned i = 0; i < num_vbs; i++) {
+      struct anv_address anv_addr = {
+         .bo = addrs[i].buffer,
+         .offset = addrs[i].offset,
+      };
+      genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
+                                                     i, anv_addr, sizes[i]);
+   }
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't
+    * really matter for blorp because we never call apply_pipe_flushes after
+    * this point.
+    */
+   genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
+                                                       (1 << num_vbs) - 1);
 }
 
 #if GEN_GEN >= 8
@@ -1392,6 +1392,10 @@ genX(BeginCommandBuffer)(
     * executing anything.  The chances are fairly high that they will use
     * blorp at least once per primary command buffer so it shouldn't be
     * wasted.
+    *
+    * There is also a workaround on gen8 which requires us to invalidate the
+    * VF cache occasionally.  It's easier if we can assume we start with a
+    * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).)
     */
    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 
@@ -1598,6 +1602,14 @@ genX(CmdExecuteCommands)(
       anv_cmd_buffer_add_secondary(primary, secondary);
    }
 
+   /* The secondary isn't counted in our VF cache tracking so we need to
+    * invalidate the whole thing.
+    */
+   if (GEN_GEN >= 8 && GEN_GEN <= 9) {
+      primary->state.pending_pipe_bits |=
+         ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+   }
+
    /* The secondary may have selected a different pipeline (3D or compute) and
     * may have changed the current L3$ configuration.  Reset our tracking
     * variables to invalid values to ensure that we re-emit these in the case
@@ -1836,6 +1848,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
       bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
    }
 
+   if ((GEN_GEN >= 8 && GEN_GEN <= 9) &&
+       (bits & ANV_PIPE_CS_STALL_BIT) &&
+       (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+      /* If we are doing a VF cache invalidate AND a CS stall (it must be
+       * both) then we can reset our vertex cache tracking.
+       */
+      memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
+             sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
+      memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
+             sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+   }
+
    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 #if GEN_GEN >= 12
@@ -2830,6 +2854,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 #endif
          };
 
+#if GEN_GEN >= 8 && GEN_GEN <= 9
+         genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb,
+                                                        state.BufferStartingAddress,
+                                                        state.BufferSize);
+#endif
+
          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
          i++;
       }
@@ -2967,6 +2997,9 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
          .EndAddress = anv_address_add(addr, size),
 #endif
       });
+
+   genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
+                                                  index, addr, size);
 }
 
 static void
@@ -3014,6 +3047,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
 }
 
+static void
+update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t access_type)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+   uint64_t vb_used = pipeline->vb_used;
+   if (vs_prog_data->uses_firstvertex ||
+       vs_prog_data->uses_baseinstance)
+      vb_used |= 1ull << ANV_SVGS_VB_INDEX;
+   if (vs_prog_data->uses_drawid)
+      vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
+
+   genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer,
+                                                       access_type == RANDOM,
+                                                       vb_used);
+}
+
 void genX(CmdDraw)(
     VkCommandBuffer commandBuffer,
     uint32_t vertexCount,
@@ -3059,6 +3111,8 @@ void genX(CmdDraw)(
       prim.StartInstanceLocation = firstInstance;
       prim.BaseVertexLocation = 0;
    }
+
+   update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
 }
 
 void genX(CmdDrawIndexed)(
@@ -3107,6 +3161,8 @@ void genX(CmdDrawIndexed)(
       prim.StartInstanceLocation = firstInstance;
       prim.BaseVertexLocation = vertexOffset;
    }
+
+   update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
 }
 
 /* Auto-Draw / Indirect Registers */
@@ -3179,6 +3235,8 @@ void genX(CmdDrawIndirectByteCountEXT)(
       prim.VertexAccessType = SEQUENTIAL;
       prim.PrimitiveTopologyType = pipeline->topology;
    }
+
+   update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
 #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
 }
 
@@ -3263,6 +3321,8 @@ void genX(CmdDrawIndirect)(
          prim.PrimitiveTopologyType = pipeline->topology;
       }
 
+      update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
       offset += stride;
    }
 }
@@ -3311,6 +3371,8 @@ void genX(CmdDrawIndexedIndirect)(
          prim.PrimitiveTopologyType = pipeline->topology;
       }
 
+      update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
       offset += stride;
    }
 }
@@ -3465,6 +3527,8 @@ void genX(CmdDrawIndirectCountKHR)(
          prim.PrimitiveTopologyType = pipeline->topology;
       }
 
+      update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
       offset += stride;
    }
 }
@@ -3530,6 +3594,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
          prim.PrimitiveTopologyType = pipeline->topology;
       }
 
+      update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
       offset += stride;
    }
 }
@@ -4115,6 +4181,120 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
+ *
+ *    "The VF cache needs to be invalidated before binding and then using
+ *    Vertex Buffers that overlap with any previously bound Vertex Buffer
+ *    (at a 64B granularity) since the last invalidation.  A VF cache
+ *    invalidate is performed by setting the "VF Cache Invalidation Enable"
+ *    bit in PIPE_CONTROL."
+ *
+ * This is implemented by carefully tracking all vertex and index buffer
+ * bindings and flushing if the cache ever ends up with a range in the cache
+ * that would exceed 4 GiB.  This is implemented in three parts:
+ *
+ *    1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called
+ *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
+ *       tracking code of the new binding.  If this new binding would cause
+ *       the cache to have a too-large range on the next draw call, a pipeline
+ *       stall and VF cache invalidate are added to pending_pipeline_bits.
+ *
+ *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
+ *       empty whenever we emit a VF invalidate.
+ *
+ *    3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called
+ *       after every 3DPRIMITIVE and copies the bound range into the dirty
+ *       range for each used buffer.  This has to be a separate step because
+ *       we don't always re-bind all buffers and so 1. can't know which
+ *       buffers are actually bound.
+ */
+void
+genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                               int vb_index,
+                                               struct anv_address vb_address,
+                                               uint32_t vb_size)
+{
+   if (GEN_GEN < 8 || GEN_GEN > 9 ||
+       !cmd_buffer->device->instance->physicalDevice.use_softpin)
+      return;
+
+   struct anv_vb_cache_range *bound, *dirty;
+   if (vb_index == -1) {
+      bound = &cmd_buffer->state.gfx.ib_bound_range;
+      dirty = &cmd_buffer->state.gfx.ib_dirty_range;
+   } else {
+      assert(vb_index >= 0);
+      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+      bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
+      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
+   }
+
+   if (vb_size == 0) {
+      bound->start = 0;
+      bound->end = 0;
+      return;
+   }
+
+   assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
+   bound->start = gen_48b_address(anv_address_physical(vb_address));
+   bound->end = bound->start + vb_size;
+   assert(bound->end > bound->start); /* No overflow */
+
+   /* Align everything to a cache line */
+   bound->start &= ~(64ull - 1ull);
+   bound->end = align_u64(bound->end, 64);
+
+   /* Compute the dirty range */
+   dirty->start = MIN2(dirty->start, bound->start);
+   dirty->end = MAX2(dirty->end, bound->end);
+
+   /* If our range is larger than 32 bits, we have to flush */
+   assert(bound->end - bound->start <= (1ull << 32));
+   if (dirty->end - dirty->start > (1ull << 32)) {
+      cmd_buffer->state.pending_pipe_bits |=
+         ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+   }
+}
+
+void
+genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                                    uint32_t access_type,
+                                                    uint64_t vb_used)
+{
+   if (GEN_GEN < 8 || GEN_GEN > 9 ||
+       !cmd_buffer->device->instance->physicalDevice.use_softpin)
+      return;
+
+   if (access_type == RANDOM) {
+      /* We have an index buffer */
+      struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
+      struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
+
+      if (bound->end > bound->start) {
+         dirty->start = MIN2(dirty->start, bound->start);
+         dirty->end = MAX2(dirty->end, bound->end);
+      }
+   }
+
+   uint64_t mask = vb_used;
+   while (mask) {
+      int i = u_bit_scan64(&mask);
+      assert(i >= 0);
+      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+
+      struct anv_vb_cache_range *bound, *dirty;
+      bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
+      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
+
+      if (bound->end > bound->start) {
+         dirty->start = MIN2(dirty->start, bound->start);
+         dirty->end = MAX2(dirty->end, bound->end);
+      }
+   }
+}
+
 /**
  * Update the pixel hashing modes that determine the balancing of PS threads
  * across subslices and slices.
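
A compact, self-contained model of the three-part scheme documented in the comment above (all types and function names below are local to this illustration, not driver code): part 1 records a binding and requests a flush when the tracked span would exceed 4 GiB, part 2 resets the tracking when the invalidate is actually emitted, and part 3 folds the bound ranges of the buffers a draw actually used into the dirty ranges after each 3DPRIMITIVE.

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    struct range { uint64_t start, end; };      /* end is exclusive */

    struct vf_tracker {
       struct range bound[33];                  /* most recent binding per VB slot */
       struct range dirty[33];                  /* what might still sit in the cache */
       bool need_invalidate;                    /* stands in for pending_pipe_bits */
    };

    static void merge(struct range *d, const struct range *b)
    {
       if (d->end <= d->start) {
          *d = *b;
       } else {
          if (b->start < d->start) d->start = b->start;
          if (b->end > d->end) d->end = b->end;
       }
    }

    /* Part 1: call whenever a vertex buffer is (re)bound. */
    static void set_binding(struct vf_tracker *t, int slot, uint64_t addr, uint64_t size)
    {
       struct range b = { addr & ~63ull, (addr + size + 63ull) & ~63ull };
       t->bound[slot] = b;
       merge(&t->dirty[slot], &b);
       if (t->dirty[slot].end - t->dirty[slot].start > (1ull << 32))
          t->need_invalidate = true;
    }

    /* Part 2: call when the pending VF cache invalidate is actually emitted. */
    static void apply_flushes(struct vf_tracker *t)
    {
       if (t->need_invalidate) {
          memset(t->dirty, 0, sizeof(t->dirty));
          t->need_invalidate = false;
       }
    }

    /* Part 3: call after each 3DPRIMITIVE, for the VB slots the draw used. */
    static void update_dirty_after_draw(struct vf_tracker *t, uint64_t vb_used)
    {
       for (int i = 0; i < 33; i++) {
          if ((vb_used & (1ull << i)) && t->bound[i].end > t->bound[i].start)
             merge(&t->dirty[i], &t->bound[i]);
       }
    }

The real code differs in bookkeeping details (it keeps the index buffer in a separate pair of ranges and represents empty ranges slightly differently), but the flow is the same.
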
@@ -78,6 +78,7 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
       genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
    }
 
+   genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size);
    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 
    genX(flush_pipeline_select_3d)(cmd_buffer);
@@ -229,5 +230,8 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
       prim.BaseVertexLocation = 0;
    }
 
+   genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
+                                                       1ull << 32);
+
    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
 }