anv: move TRTT submissions over to anv_async_submit

This lets us remove a bunch of TRTT-specific code from the backends, as
well as the manual submission tracking.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28975>
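
For reviewers skimming the diff, the hunks below add up to one flow. Here is a
condensed sketch of it (error handling and locking elided; trtt_submit_sketch
is a made-up name, every other identifier is taken from the diff):

   /* Sketch: how a TR-TT bind becomes an anv_async_submit. */
   static VkResult
   trtt_submit_sketch(struct anv_device *device,
                      struct anv_sparse_submission *sparse_submit)
   {
      struct anv_trtt *trtt = &device->trtt;

      /* One heap-allocated anv_trtt_submission per bind operation. */
      struct anv_trtt_submission *submit =
         vk_zalloc(&device->vk.alloc, sizeof(*submit), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

      anv_async_submit_init(&submit->base, sparse_submit->queue,
                            &device->batch_bo_pool, false, false);

      /* Completion is tracked on the device-wide TR-TT vk_sync timeline
       * instead of a hand-rolled DRM syncobj. */
      submit->base.signal = (struct vk_sync_signal) {
         .sync = trtt->timeline,
         .signal_value = ++trtt->timeline_val,
      };

      /* ... emit L3/L2/L1 page table updates into submit->base.batch ... */

      anv_genX(device->info, async_submit_end)(&submit->base);
      device->kmd_backend->queue_exec_async(&submit->base,
                                            sparse_submit->wait_count,
                                            sparse_submit->waits,
                                            sparse_submit->signal_count,
                                            sparse_submit->signals);

      /* Freed later by anv_sparse_trtt_garbage_collect_batches() once the
       * timeline reaches signal_value. */
      list_addtail(&submit->link, &trtt->in_flight_batches);
      return VK_SUCCESS;
   }

Because the generic queue_exec_async path now carries TR-TT batches like any
other async submission, the i915 and Xe backends can drop their
execute_trtt_batch vfuncs entirely.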

@@ -1668,37 +1668,6 @@ anv_queue_submit_simple_batch(struct anv_queue *queue,
return result;
}
VkResult
anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
struct anv_batch *batch)
{
struct anv_queue *queue = submit->queue;
struct anv_device *device = queue->device;
VkResult result = VK_SUCCESS;
uint32_t batch_size = align(batch->next - batch->start, 8);
struct anv_trtt_batch_bo *trtt_bbo;
result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
if (result != VK_SUCCESS)
return result;
memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush &&
anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags))
intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
#endif
if (INTEL_DEBUG(DEBUG_BATCH)) {
intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
trtt_bbo->bo->offset, false);
}
result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
return result;
}
void
anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)

@@ -3246,14 +3246,25 @@ anv_device_destroy_context_or_vm(struct anv_device *device)
}
}
static void
static VkResult
anv_device_init_trtt(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
VkResult result =
vk_sync_create(&device->vk,
&device->physical->sync_syncobj_type,
VK_SYNC_IS_TIMELINE,
0 /* initial_value */,
&trtt->timeline);
if (result != VK_SUCCESS)
return result;
simple_mtx_init(&trtt->mutex, mtx_plain);
list_inithead(&trtt->in_flight_batches);
return VK_SUCCESS;
}
static void
@@ -3261,31 +3272,9 @@ anv_device_finish_trtt(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (trtt->timeline_val > 0) {
struct drm_syncobj_timeline_wait wait = {
.handles = (uintptr_t)&trtt->timeline_handle,
.points = (uintptr_t)&trtt->timeline_val,
.timeout_nsec = INT64_MAX,
.count_handles = 1,
.flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
.first_signaled = false,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
fprintf(stderr, "TR-TT syncobj wait failed!\n");
anv_sparse_trtt_garbage_collect_batches(device, true);
list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
&trtt->in_flight_batches, link)
anv_trtt_batch_bo_free(device, trtt_bbo);
}
if (trtt->timeline_handle > 0) {
struct drm_syncobj_destroy destroy = {
.handle = trtt->timeline_handle,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
fprintf(stderr, "TR-TT syncobj destroy failed!\n");
}
vk_sync_destroy(&device->vk, trtt->timeline);
simple_mtx_destroy(&trtt->mutex);
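
The two hunks above are the heart of the cleanup: the hand-rolled
DRM_IOCTL_SYNCOBJ_* calls are replaced by the vk_sync timeline API. A minimal
lifecycle sketch, using only the calls that appear in this commit:

   /* Create a timeline sync object with an initial value of 0. */
   struct vk_sync *timeline;
   VkResult result =
      vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
                     VK_SYNC_IS_TIMELINE, 0 /* initial_value */, &timeline);

   /* Non-blocking query of the last signaled timeline point. */
   uint64_t value;
   result = vk_sync_get_value(&device->vk, timeline, &value);

   /* Block until a given point has signaled. */
   result = vk_sync_wait(&device->vk, timeline, value,
                         VK_SYNC_WAIT_COMPLETE,
                         os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));

   vk_sync_destroy(&device->vk, timeline);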
@@ -3915,6 +3904,10 @@ VkResult anv_CreateDevice(
}
}
result = anv_device_init_trtt(device);
if (result != VK_SUCCESS)
goto fail_companion_cmd_pool;
anv_device_init_blorp(device);
anv_device_init_border_colors(device);
@@ -3929,8 +3922,6 @@ VkResult anv_CreateDevice(
anv_device_init_embedded_samplers(device);
anv_device_init_trtt(device);
BITSET_ONES(device->gfx_dirty_state);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST);
@@ -3963,13 +3954,13 @@ VkResult anv_CreateDevice(
result = anv_genX(device->info, init_device_state)(device);
if (result != VK_SUCCESS)
goto fail_companion_cmd_pool;
goto fail_inits;
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
fail_companion_cmd_pool:
fail_inits:
anv_device_finish_trtt(device);
anv_device_finish_embedded_samplers(device);
anv_device_utrace_finish(device);
@@ -3977,7 +3968,7 @@ VkResult anv_CreateDevice(
anv_device_finish_rt_shaders(device);
anv_device_finish_astc_emu(device);
anv_device_finish_internal_kernels(device);
fail_companion_cmd_pool:
if (device->info->verx10 >= 125) {
vk_common_DestroyCommandPool(anv_device_to_handle(device),
device->companion_rcs_cmd_pool, NULL);
@@ -4089,6 +4080,7 @@ void anv_DestroyDevice(
struct anv_physical_device *pdevice = device->physical;
/* Do TRTT batch garbage collection before destroying queues. */
anv_device_finish_trtt(device);
for (uint32_t i = 0; i < device->queue_count; i++)

@@ -65,13 +65,6 @@ stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
return VK_ERROR_UNKNOWN;
}
static VkResult
stub_execute_trtt_batch(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo)
{
return VK_ERROR_UNKNOWN;
}
static VkResult
stub_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,
@@ -180,7 +173,6 @@ const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
.vm_bind_bo = stub_vm_bind_bo,
.vm_unbind_bo = stub_vm_bind_bo,
.execute_simple_batch = stub_execute_simple_batch,
.execute_trtt_batch = stub_execute_trtt_batch,
.queue_exec_locked = stub_queue_exec_locked,
.queue_exec_async = stub_queue_exec_async,
.bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags,

@@ -38,8 +38,10 @@
struct intel_sample_positions;
struct intel_urb_config;
struct anv_async_submit;
struct anv_embedded_sampler;
struct anv_pipeline_embedded_sampler_binding;
struct anv_trtt_bind;
typedef struct nir_builder nir_builder;
typedef struct nir_shader nir_shader;
@@ -351,9 +353,16 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
void
genX(emit_simple_shader_end)(struct anv_simple_shader *state);
VkResult genX(init_trtt_context_state)(struct anv_queue *queue);
VkResult genX(init_trtt_context_state)(struct anv_device *device,
struct anv_async_submit *submit);
VkResult genX(write_trtt_entries)(struct anv_trtt_submission *submit);
void genX(write_trtt_entries)(struct anv_async_submit *submit,
struct anv_trtt_bind *l3l2_binds,
uint32_t n_l3l2_binds,
struct anv_trtt_bind *l1_binds,
uint32_t n_l1_binds);
void genX(async_submit_end)(struct anv_async_submit *submit);
void
genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,

@@ -40,7 +40,6 @@ struct anv_query_pool;
struct anv_async_submit;
struct anv_utrace_submit;
struct anv_sparse_submission;
struct anv_trtt_batch_bo;
enum anv_vm_bind_op {
/* bind vma specified in anv_vm_bind */
@@ -113,8 +112,6 @@ struct anv_kmd_backend {
bool is_companion_rcs_batch);
/* The caller is expected to hold device->mutex when calling this vfunc.
*/
VkResult (*execute_trtt_batch)(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo);
VkResult (*queue_exec_locked)(struct anv_queue *queue,
uint32_t wait_count,
const struct vk_sync_wait *waits,

@@ -768,35 +768,6 @@ struct anv_state_stream {
struct util_dynarray all_blocks;
};
struct anv_sparse_submission {
struct anv_queue *queue;
struct anv_vm_bind *binds;
int binds_len;
int binds_capacity;
uint32_t wait_count;
uint32_t signal_count;
struct vk_sync_wait *waits;
struct vk_sync_signal *signals;
};
struct anv_trtt_bind {
uint64_t pte_addr;
uint64_t entry_addr;
};
struct anv_trtt_submission {
struct anv_sparse_submission *sparse;
struct anv_trtt_bind *l3l2_binds;
struct anv_trtt_bind *l1_binds;
int l3l2_binds_len;
int l1_binds_len;
};
/* The block_pool functions exported for testing only. The block pool should
* only be used via a state pool (see below).
*/
@@ -1788,19 +1759,6 @@ struct anv_device_astc_emu {
VkPipeline pipeline;
};
struct anv_trtt_batch_bo {
struct anv_bo *bo;
uint32_t size;
/* Once device->trtt.timeline_handle signals timeline_val as complete we
* can free this struct and its members.
*/
uint64_t timeline_val;
/* Part of device->trtt.in_flight_batches. */
struct list_head link;
};
struct anv_device {
struct vk_device vk;
@@ -2028,12 +1986,11 @@ struct anv_device {
struct anv_bo *cur_page_table_bo;
uint64_t next_page_table_bo_offset;
/* Timeline syncobj used to track completion of the TR-TT batch BOs. */
uint32_t timeline_handle;
struct vk_sync *timeline;
uint64_t timeline_val;
/* List of struct anv_trtt_batch_bo batches that are in flight and can
* be freed once their timeline gets signaled.
/* List of struct anv_trtt_submission that are in flight and can be
* freed once their vk_sync gets signaled.
*/
struct list_head in_flight_batches;
} trtt;
@@ -2203,17 +2160,6 @@ VkResult anv_queue_submit(struct vk_queue *queue,
VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
struct anv_batch *batch,
bool is_companion_rcs_batch);
VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
struct anv_batch *batch);
static inline void
anv_trtt_batch_bo_free(struct anv_device *device,
struct anv_trtt_batch_bo *trtt_bbo)
{
anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo);
list_del(&trtt_bbo->link);
vk_free(&device->vk.alloc, trtt_bbo);
}
void anv_queue_trace(struct anv_queue *queue, const char *label,
bool frame, bool begin);
@@ -2521,6 +2467,32 @@ anv_async_submit_done(struct anv_async_submit *submit);
bool
anv_async_submit_wait(struct anv_async_submit *submit);
struct anv_sparse_submission {
struct anv_queue *queue;
struct anv_vm_bind *binds;
int binds_len;
int binds_capacity;
uint32_t wait_count;
uint32_t signal_count;
struct vk_sync_wait *waits;
struct vk_sync_signal *signals;
};
struct anv_trtt_bind {
uint64_t pte_addr;
uint64_t entry_addr;
};
struct anv_trtt_submission {
struct anv_async_submit base;
struct anv_sparse_submission *sparse;
struct list_head link;
};
struct anv_device_memory {
struct vk_device_memory vk;
@@ -3217,6 +3189,9 @@ VkResult anv_sparse_bind_image_memory(struct anv_queue *queue,
VkResult anv_sparse_bind(struct anv_device *device,
struct anv_sparse_submission *sparse_submit);
VkResult anv_sparse_trtt_garbage_collect_batches(struct anv_device *device,
bool wait_completion);
VkSparseImageFormatProperties
anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice,
VkImageAspectFlags aspect,
@@ -3236,8 +3211,6 @@ VkResult anv_sparse_image_check_support(struct anv_physical_device *pdevice,
VkSampleCountFlagBits samples,
VkImageType type,
VkFormat format);
VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
struct anv_trtt_batch_bo **out_trtt_bbo);
struct anv_buffer {
struct vk_buffer vk;

@@ -396,20 +396,11 @@ trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo,
}
static VkResult
anv_trtt_init_context_state(struct anv_queue *queue)
anv_trtt_init_context_state(struct anv_device *device,
struct anv_async_submit *submit)
{
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct drm_syncobj_create create = {
.handle = 0,
.flags = 0,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create))
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
assert(create.handle != 0);
trtt->timeline_handle = create.handle;
struct anv_bo *l3_bo;
VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr);
if (result != VK_SUCCESS)
@@ -430,7 +421,7 @@ anv_trtt_init_context_state(struct anv_queue *queue)
goto fail_free_l3;
}
result = anv_genX(device->info, init_trtt_context_state)(queue);
result = anv_genX(device->info, init_trtt_context_state)(device, submit);
return result;
@@ -439,17 +430,6 @@ fail_free_l3:
return result;
}
static void
anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len,
uint64_t pte_addr, uint64_t entry_addr)
{
binds[*binds_len] = (struct anv_trtt_bind) {
.pte_addr = pte_addr,
.entry_addr = entry_addr,
};
(*binds_len)++;
}
/* For L3 and L2 pages, null and invalid entries are indicated by bits 1 and 0
* respectively. For L1 entries, the hardware compares the addresses against
* what we program to the GFX_TRTT_NULL and GFX_TRTT_INVAL registers.
@@ -457,13 +437,27 @@ anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len,
#define ANV_TRTT_L3L2_NULL_ENTRY (1 << 1)
#define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0)
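
Two hypothetical helpers (not part of this commit) spelling out the encoding
described above:

   /* An L3/L2 entry is either the address of a lower-level table or one of
    * the two marker values; the asserts in anv_trtt_bind_add() rely on real
    * addresses never colliding with the markers. */
   static inline bool
   anv_trtt_l3l2_entry_is_null(uint64_t entry)
   {
      return entry == ANV_TRTT_L3L2_NULL_ENTRY;    /* bit 1 */
   }

   static inline bool
   anv_trtt_l3l2_entry_is_invalid(uint64_t entry)
   {
      return entry == ANV_TRTT_L3L2_INVALID_ENTRY; /* bit 0 */
   }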
static void
anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, uint32_t *binds_len,
uint64_t pte_addr, uint64_t entry_addr)
{
binds[*binds_len] = (struct anv_trtt_bind) {
.pte_addr = pte_addr,
.entry_addr = entry_addr,
};
(*binds_len)++;
}
/* Adds elements to the anv_trtt_bind structs passed. This doesn't write the
* entries to the HW yet.
*/
static VkResult
anv_trtt_bind_add(struct anv_device *device,
uint64_t trtt_addr, uint64_t dest_addr,
struct anv_trtt_submission *s)
struct anv_trtt_bind *l3l2_binds,
uint32_t *n_l3l2_binds,
struct anv_trtt_bind *l1_binds,
uint32_t *n_l1_binds)
{
VkResult result = VK_SUCCESS;
struct anv_trtt *trtt = &device->trtt;
@@ -480,9 +474,10 @@ anv_trtt_bind_add(struct anv_device *device,
if (is_null_bind) {
trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
trtt->l3_addr + l3_index * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
trtt->l3_addr + l3_index *
sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
return VK_SUCCESS;
}
@@ -494,8 +489,9 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l3_mirror[l3_index] = l2_addr;
anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
trtt->l3_addr + l3_index * sizeof(uint64_t), l2_addr);
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
trtt->l3_addr + l3_index *
sizeof(uint64_t), l2_addr);
}
assert(l2_addr != 0 && l2_addr != ANV_TRTT_L3L2_NULL_ENTRY);
@@ -508,9 +504,9 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l2_mirror[l3_index * 512 + l2_index] =
ANV_TRTT_L3L2_NULL_ENTRY;
anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
l2_addr + l2_index * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
l2_addr + l2_index * sizeof(uint64_t),
ANV_TRTT_L3L2_NULL_ENTRY);
return VK_SUCCESS;
}
@@ -522,13 +518,65 @@ anv_trtt_bind_add(struct anv_device *device,
trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr;
anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
l2_addr + l2_index * sizeof(uint64_t), l1_addr);
anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds,
l2_addr + l2_index * sizeof(uint64_t),
l1_addr);
}
assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY);
anv_trtt_bind_list_add_entry(s->l1_binds, &s->l1_binds_len,
l1_addr + l1_index * sizeof(uint32_t), dest_addr);
anv_trtt_bind_list_add_entry(l1_binds, n_l1_binds,
l1_addr + l1_index * sizeof(uint32_t),
dest_addr);
return VK_SUCCESS;
}
VkResult
anv_sparse_trtt_garbage_collect_batches(struct anv_device *device,
bool wait_completion)
{
struct anv_trtt *trtt = &device->trtt;
uint64_t last_value;
if (!wait_completion) {
VkResult result =
vk_sync_get_value(&device->vk, trtt->timeline, &last_value);
if (result != VK_SUCCESS)
return result;
} else {
last_value = trtt->timeline_val;
}
list_for_each_entry_safe(struct anv_trtt_submission, submit,
&trtt->in_flight_batches, link) {
if (submit->base.signal.signal_value <= last_value) {
list_del(&submit->link);
anv_async_submit_fini(&submit->base);
vk_free(&device->vk.alloc, submit);
continue;
}
if (!wait_completion)
break;
VkResult result = vk_sync_wait(
&device->vk,
submit->base.signal.sync,
submit->base.signal.signal_value,
VK_SYNC_WAIT_COMPLETE,
os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
if (result == VK_SUCCESS) {
list_del(&submit->link);
anv_async_submit_fini(&submit->base);
vk_free(&device->vk.alloc, submit);
continue;
}
/* If the wait failed but the caller wanted completion, return the
* error.
*/
return result;
}
return VK_SUCCESS;
}
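
The wait_completion flag maps directly onto the two call sites this commit
uses:

   /* From anv_sparse_bind_trtt(), under trtt->mutex: opportunistic,
    * non-blocking cleanup of already-signaled submissions. */
   anv_sparse_trtt_garbage_collect_batches(device, false);

   /* From anv_device_finish_trtt(): drain everything at device teardown. */
   anv_sparse_trtt_garbage_collect_batches(device, true);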
@@ -545,6 +593,35 @@ anv_sparse_bind_trtt(struct anv_device *device,
if (!sparse_submit->queue)
sparse_submit->queue = trtt->queue;
struct anv_trtt_submission *submit =
vk_zalloc(&device->vk.alloc, sizeof(*submit), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (submit == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_async_submit_init(&submit->base, sparse_submit->queue,
&device->batch_bo_pool,
false, false);
if (result != VK_SUCCESS)
goto error_async;
simple_mtx_lock(&trtt->mutex);
anv_sparse_trtt_garbage_collect_batches(device, false);
submit->base.signal = (struct vk_sync_signal) {
.sync = trtt->timeline,
.signal_value = ++trtt->timeline_val,
};
/* If the TRTT L3 table was never set, initialize it as part of this
* submission.
*/
if (!trtt->l3_addr)
anv_trtt_init_context_state(device, &submit->base);
assert(trtt->l3_addr);
/* These capacities are conservative estimations. For L1 binds the
* number will match exactly unless we skip NULL binds due to L2 already
* being NULL. For L3/L2 things are harder to estimate, but the resulting
@@ -561,26 +638,15 @@ anv_sparse_bind_trtt(struct anv_device *device,
l3l2_binds_capacity += (pages / 1024 + 1) * 2;
}
/* Turn a series of virtual address maps into a list of L3/L2/L1 TRTT page
 * table updates.
 */
STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity);
STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity);
struct anv_trtt_submission trtt_submit = {
.sparse = sparse_submit,
.l3l2_binds = l3l2_binds,
.l1_binds = l1_binds,
.l3l2_binds_len = 0,
.l1_binds_len = 0,
};
simple_mtx_lock(&trtt->mutex);
if (!trtt->l3_addr)
anv_trtt_init_context_state(sparse_submit->queue);
assert(trtt->l3_addr);
for (int b = 0; b < sparse_submit->binds_len; b++) {
uint32_t n_l3l2_binds = 0, n_l1_binds = 0;
for (int b = 0; b < sparse_submit->binds_len && result == VK_SUCCESS; b++) {
struct anv_vm_bind *vm_bind = &sparse_submit->binds[b];
for (size_t i = 0; i < vm_bind->size; i += 64 * 1024) {
for (size_t i = 0; i < vm_bind->size && result == VK_SUCCESS; i += 64 * 1024) {
uint64_t trtt_addr = vm_bind->address + i;
uint64_t dest_addr =
(vm_bind->op == ANV_VM_BIND && vm_bind->bo) ?
@@ -588,29 +654,74 @@ anv_sparse_bind_trtt(struct anv_device *device,
ANV_TRTT_L1_NULL_TILE_VAL;
result = anv_trtt_bind_add(device, trtt_addr, dest_addr,
&trtt_submit);
if (result != VK_SUCCESS)
goto out;
l3l2_binds, &n_l3l2_binds,
l1_binds, &n_l1_binds);
}
}
assert(trtt_submit.l3l2_binds_len <= l3l2_binds_capacity);
assert(trtt_submit.l1_binds_len <= l1_binds_capacity);
assert(n_l3l2_binds <= l3l2_binds_capacity);
assert(n_l1_binds <= l1_binds_capacity);
sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n",
sparse_submit->binds_len, trtt_submit.l3l2_binds_len,
trtt_submit.l1_binds_len);
/* Convert the L3/L2/L1 TRTT page table updates in anv_trtt_bind elements
* into MI commands.
*/
if (result == VK_SUCCESS) {
sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n",
sparse_submit->binds_len, n_l3l2_binds, n_l1_binds);
if (trtt_submit.l3l2_binds_len || trtt_submit.l1_binds_len)
result = anv_genX(device->info, write_trtt_entries)(&trtt_submit);
if (n_l3l2_binds || n_l1_binds) {
anv_genX(device->info, write_trtt_entries)(
&submit->base, l3l2_binds, n_l3l2_binds, l1_binds, n_l1_binds);
}
}
if (result == VK_SUCCESS)
ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len);
out:
simple_mtx_unlock(&trtt->mutex);
STACK_ARRAY_FINISH(l1_binds);
STACK_ARRAY_FINISH(l3l2_binds);
anv_genX(device->info, async_submit_end)(&submit->base);
if (submit->base.batch.status != VK_SUCCESS) {
result = submit->base.batch.status;
goto error_add_bind;
}
/* Add all the BOs backing TRTT page tables to the reloc list.
*
* TODO: we could narrow down the list by using anv_address structures in
* anv_trtt_bind for the pte_addr.
*/
if (device->physical->uses_relocs) {
for (int i = 0; i < trtt->num_page_table_bos; i++) {
result = anv_reloc_list_add_bo(&submit->base.relocs,
trtt->page_table_bos[i]);
if (result != VK_SUCCESS)
goto error_add_bind;
}
}
result =
device->kmd_backend->queue_exec_async(&submit->base,
sparse_submit->wait_count,
sparse_submit->waits,
sparse_submit->signal_count,
sparse_submit->signals);
if (result != VK_SUCCESS)
goto error_add_bind;
list_addtail(&submit->link, &trtt->in_flight_batches);
simple_mtx_unlock(&trtt->mutex);
ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len);
return VK_SUCCESS;
error_add_bind:
simple_mtx_unlock(&trtt->mutex);
anv_async_submit_fini(&submit->base);
error_async:
vk_free(&device->vk.alloc, submit);
return result;
}
@@ -1299,65 +1410,3 @@ anv_sparse_image_check_support(struct anv_physical_device *pdevice,
return VK_SUCCESS;
}
static VkResult
anv_trtt_garbage_collect_batches(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (trtt->timeline_val % 8 != 7)
return VK_SUCCESS;
uint64_t cur_timeline_val = 0;
struct drm_syncobj_timeline_array array = {
.handles = (uintptr_t)&trtt->timeline_handle,
.points = (uintptr_t)&cur_timeline_val,
.count_handles = 1,
.flags = 0,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array))
return vk_error(device, VK_ERROR_UNKNOWN);
list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
&trtt->in_flight_batches, link) {
if (trtt_bbo->timeline_val > cur_timeline_val)
return VK_SUCCESS;
anv_trtt_batch_bo_free(device, trtt_bbo);
}
return VK_SUCCESS;
}
VkResult
anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
struct anv_trtt_batch_bo **out_trtt_bbo)
{
struct anv_trtt *trtt = &device->trtt;
VkResult result;
anv_trtt_garbage_collect_batches(device);
struct anv_trtt_batch_bo *trtt_bbo =
vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!trtt_bbo)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size,
&trtt_bbo->bo);
if (result != VK_SUCCESS)
goto out;
trtt_bbo->size = batch_size;
trtt_bbo->timeline_val = ++trtt->timeline_val;
list_addtail(&trtt_bbo->link, &trtt->in_flight_batches);
*out_trtt_bbo = trtt_bbo;
return VK_SUCCESS;
out:
vk_free(&device->vk.alloc, trtt_bbo);
return result;
}

@@ -6094,22 +6094,17 @@ genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
#endif
}
VkResult
genX(write_trtt_entries)(struct anv_trtt_submission *submit)
void
genX(write_trtt_entries)(struct anv_async_submit *submit,
struct anv_trtt_bind *l3l2_binds,
uint32_t n_l3l2_binds,
struct anv_trtt_bind *l1_binds,
uint32_t n_l1_binds)
{
#if GFX_VER >= 12
const struct intel_device_info *devinfo =
submit->sparse->queue->device->info;
size_t batch_size = submit->l3l2_binds_len * 20 +
submit->l1_binds_len * 16 +
GENX(PIPE_CONTROL_length) * sizeof(uint32_t) + 8;
STACK_ARRAY(uint32_t, cmds, batch_size);
struct anv_batch batch = {
.start = cmds,
.next = cmds,
.end = (void *)cmds + batch_size,
};
submit->queue->device->info;
struct anv_batch *batch = &submit->batch;
/* BSpec says:
* "DWord Length programmed must not exceed 0x3FE."
@@ -6127,90 +6122,86 @@ genX(write_trtt_entries)(struct anv_trtt_submission *submit)
* contiguous addresses.
*/
for (int i = 0; i < submit->l3l2_binds_len; i++) {
for (uint32_t i = 0; i < n_l3l2_binds; i++) {
int extra_writes = 0;
for (int j = i + 1;
j < submit->l3l2_binds_len &&
extra_writes <= max_qword_extra_writes;
for (uint32_t j = i + 1;
j < n_l3l2_binds && extra_writes <= max_qword_extra_writes;
j++) {
if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 ==
submit->l3l2_binds[j].pte_addr) {
if (l3l2_binds[i].pte_addr + (j - i) * 8 == l3l2_binds[j].pte_addr) {
extra_writes++;
} else {
break;
}
}
bool is_last_write = submit->l1_binds_len == 0 &&
i + extra_writes + 1 == submit->l3l2_binds_len;
bool is_last_write = n_l1_binds == 0 &&
i + extra_writes + 1 == n_l3l2_binds;
uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
qword_write_len + (extra_writes * 2);
uint32_t *dw;
dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
.ForceWriteCompletionCheck = is_last_write,
.StoreQword = true,
.Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr),
.Address = anv_address_from_u64(l3l2_binds[i].pte_addr),
);
dw += 3;
for (int j = 0; j < extra_writes + 1; j++) {
uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr;
for (uint32_t j = 0; j < extra_writes + 1; j++) {
uint64_t entry_addr_64b = l3l2_binds[i + j].entry_addr;
*dw = entry_addr_64b & 0xFFFFFFFF;
dw++;
*dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
dw++;
}
assert(dw == batch.next);
assert(dw == batch->next);
i += extra_writes;
}
for (int i = 0; i < submit->l1_binds_len; i++) {
for (uint32_t i = 0; i < n_l1_binds; i++) {
int extra_writes = 0;
for (int j = i + 1;
j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes;
for (uint32_t j = i + 1;
j < n_l1_binds && extra_writes <= max_dword_extra_writes;
j++) {
if (submit->l1_binds[i].pte_addr + (j - i) * 4 ==
submit->l1_binds[j].pte_addr) {
if (l1_binds[i].pte_addr + (j - i) * 4 ==
l1_binds[j].pte_addr) {
extra_writes++;
} else {
break;
}
}
bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len;
bool is_last_write = i + extra_writes + 1 == n_l1_binds;
uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
dword_write_len + extra_writes;
uint32_t *dw;
dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
.ForceWriteCompletionCheck = is_last_write,
.Address = anv_address_from_u64(submit->l1_binds[i].pte_addr),
.Address = anv_address_from_u64(l1_binds[i].pte_addr),
);
dw += 3;
for (int j = 0; j < extra_writes + 1; j++) {
*dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
for (uint32_t j = 0; j < extra_writes + 1; j++) {
*dw = (l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
dw++;
}
assert(dw == batch.next);
assert(dw == batch->next);
i += extra_writes;
}
genx_batch_emit_pipe_control(&batch, devinfo, _3D,
genx_batch_emit_pipe_control(batch, devinfo, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_TLB_INVALIDATE_BIT);
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
assert(batch.next <= batch.end);
VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch);
STACK_ARRAY_FINISH(cmds);
return result;
#else
unreachable("Not implemented");
#endif
return VK_SUCCESS;
}
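
To make the coalescing loops above concrete, a short illustration with made-up
addresses: qword-contiguous L3/L2 PTEs are folded into a single
MI_STORE_DATA_IMM, and a gap starts a new command (the L1 loop behaves the
same way with a 4-byte stride and dword writes):

   /* Illustrative data only. */
   struct anv_trtt_bind l3l2_binds[] = {
      { .pte_addr = 0x1000, .entry_addr = 0xa0000 }, /* starts a run */
      { .pte_addr = 0x1008, .entry_addr = 0xb0000 }, /* +8, coalesced: extra_writes = 1 */
      { .pte_addr = 0x1010, .entry_addr = 0xc0000 }, /* +8, coalesced: extra_writes = 2 */
      { .pte_addr = 0x8000, .entry_addr = 0xd0000 }, /* gap: a second MI_STORE_DATA_IMM */
   };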
void
genX(async_submit_end)(struct anv_async_submit *submit)
{
struct anv_batch *batch = &submit->batch;
anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
}
void

@@ -1396,31 +1396,25 @@ genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
}
VkResult
genX(init_trtt_context_state)(struct anv_queue *queue)
genX(init_trtt_context_state)(struct anv_device *device,
struct anv_async_submit *submit)
{
#if GFX_VER >= 12
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct anv_batch *batch = &submit->batch;
uint32_t cmds[128];
struct anv_batch batch = {
.start = cmds,
.next = cmds,
.end = (void *)cmds + sizeof(cmds),
};
anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
}
anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_NULL), trtt_null) {
trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
}
#if GFX_VER >= 20
anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
trtt_va_range.TRVABase = device->physical->va.trtt.addr >> 44;
}
#else
anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
trtt_va_range.TRVAMaskValue = 0xF;
trtt_va_range.TRVADataValue = 0xF;
}
@@ -1428,28 +1422,24 @@ genX(init_trtt_context_state)(struct anv_queue *queue)
uint64_t l3_addr = trtt->l3_addr;
assert((l3_addr & 0xFFF) == 0);
anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
trtt_base_low.TRVAL3PointerLowerAddress =
(l3_addr & 0xFFFFF000) >> 12;
}
anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH),
anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH),
trtt_base_high) {
trtt_base_high.TRVAL3PointerUpperAddress =
(l3_addr >> 32) & 0xFFFF;
}
/* Enabling TR-TT needs to be done after setting up the other registers.
*/
anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) {
anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr) {
trtt_cr.TRTTEnable = true;
}
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
assert(batch.next <= batch.end);
VkResult res = anv_queue_submit_simple_batch(queue, &batch, false);
if (res != VK_SUCCESS)
return res;
genx_batch_emit_pipe_control(batch, device->info, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_TLB_INVALIDATE_BIT);
#endif
return VK_SUCCESS;
}

@@ -1051,105 +1051,3 @@ fail:
anv_execbuf_finish(&execbuf);
return result;
}
VkResult
i915_execute_trtt_batch(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo)
{
struct anv_queue *queue = submit->queue;
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct anv_execbuf execbuf = {
.alloc = &device->vk.alloc,
.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
};
VkResult result;
for (uint32_t i = 0; i < submit->wait_count; i++) {
result = anv_execbuf_add_sync(device, &execbuf, submit->waits[i].sync,
false /* is_signal */,
submit->waits[i].wait_value);
if (result != VK_SUCCESS)
goto out;
}
for (uint32_t i = 0; i < submit->signal_count; i++) {
result = anv_execbuf_add_sync(device, &execbuf, submit->signals[i].sync,
true /* is_signal */,
submit->signals[i].signal_value);
if (result != VK_SUCCESS)
goto out;
}
result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle,
I915_EXEC_FENCE_SIGNAL,
trtt_bbo->timeline_val);
if (result != VK_SUCCESS)
goto out;
result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL,
0);
if (result != VK_SUCCESS)
goto out;
for (int i = 0; i < trtt->num_page_table_bos; i++) {
result = anv_execbuf_add_bo(device, &execbuf, trtt->page_table_bos[i],
NULL, EXEC_OBJECT_WRITE);
if (result != VK_SUCCESS)
goto out;
}
if (queue->sync) {
result = anv_execbuf_add_sync(device, &execbuf, queue->sync,
true /* is_signal */,
0 /* signal_value */);
if (result != VK_SUCCESS)
goto out;
}
result = anv_execbuf_add_bo(device, &execbuf, trtt_bbo->bo, NULL, 0);
if (result != VK_SUCCESS)
goto out;
if (INTEL_DEBUG(DEBUG_SUBMIT))
anv_i915_debug_submit(&execbuf);
uint64_t exec_flags = 0;
uint32_t context_id;
get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) execbuf.objects,
.buffer_count = execbuf.bo_count,
.batch_start_offset = 0,
.batch_len = trtt_bbo->size,
.flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | exec_flags,
.rsvd1 = context_id,
.rsvd2 = 0,
};
setup_execbuf_fence_params(&execbuf);
ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
int ret = queue->device->info->no_hw ? 0 :
anv_gem_execbuffer(device, &execbuf.execbuf);
if (ret) {
result = vk_device_set_lost(&device->vk,
"trtt anv_gem_execbuffer failed: %m");
goto out;
}
if (queue->sync) {
result = vk_sync_wait(&device->vk, queue->sync, 0,
VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
if (result != VK_SUCCESS) {
result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
goto out;
}
}
out:
anv_execbuf_finish(&execbuf);
return result;
}

@@ -29,15 +29,12 @@
#include "vk_sync.h"
struct anv_device;
struct anv_queue;
struct anv_bo;
struct anv_cmd_buffer;
struct anv_query_pool;
struct anv_async_submit;
struct anv_utrace_submit;
struct anv_sparse_submission;
struct anv_trtt_batch_bo;
VkResult
i915_queue_exec_async(struct anv_async_submit *submit,
@@ -50,10 +47,6 @@ VkResult
i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
uint32_t batch_bo_size, bool is_companion_rcs_batch);
VkResult
i915_execute_trtt_batch(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo);
VkResult
i915_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,

@@ -297,7 +297,6 @@ anv_i915_kmd_backend_get(void)
.vm_bind_bo = i915_vm_bind_bo,
.vm_unbind_bo = i915_vm_bind_bo,
.execute_simple_batch = i915_execute_simple_batch,
.execute_trtt_batch = i915_execute_trtt_batch,
.queue_exec_locked = i915_queue_exec_locked,
.queue_exec_async = i915_queue_exec_async,
.bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags,

@@ -183,58 +183,6 @@ xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count,
perf_query_pool, perf_query_pass);
}
VkResult
xe_execute_trtt_batch(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo)
{
struct anv_queue *queue = submit->queue;
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
VkResult result = VK_SUCCESS;
struct drm_xe_sync extra_sync = {
.type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
.flags = DRM_XE_SYNC_FLAG_SIGNAL,
.handle = trtt->timeline_handle,
.timeline_value = trtt_bbo->timeline_val,
};
struct drm_xe_sync *xe_syncs = NULL;
uint32_t xe_syncs_count = 0;
result = xe_exec_process_syncs(queue, submit->wait_count, submit->waits,
submit->signal_count, submit->signals,
1, &extra_sync,
NULL, /* utrace_submit */
false, /* is_companion_rcs_queue */
&xe_syncs, &xe_syncs_count);
if (result != VK_SUCCESS)
return result;
struct drm_xe_exec exec = {
.exec_queue_id = queue->exec_queue_id,
.num_syncs = xe_syncs_count,
.syncs = (uintptr_t)xe_syncs,
.address = trtt_bbo->bo->offset,
.num_batch_buffer = 1,
};
if (!device->info->no_hw) {
if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
goto out;
}
}
if (queue->sync) {
result = vk_sync_wait(&device->vk, queue->sync, 0,
VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
}
out:
vk_free(&device->vk.alloc, xe_syncs);
return result;
}
VkResult
xe_queue_exec_async(struct anv_async_submit *submit,
uint32_t wait_count,

@@ -36,17 +36,11 @@ struct anv_cmd_buffer;
struct anv_query_pool;
struct anv_async_submit;
struct anv_utrace_submit;
struct anv_sparse_submission;
struct anv_trtt_batch_bo;
VkResult
xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
uint32_t batch_bo_size, bool is_companion_rcs_batch);
VkResult
xe_execute_trtt_batch(struct anv_sparse_submission *submit,
struct anv_trtt_batch_bo *trtt_bbo);
VkResult
xe_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,

@@ -346,7 +346,6 @@ anv_xe_kmd_backend_get(void)
.vm_bind_bo = xe_vm_bind_bo,
.vm_unbind_bo = xe_vm_unbind_bo,
.execute_simple_batch = xe_execute_simple_batch,
.execute_trtt_batch = xe_execute_trtt_batch,
.queue_exec_locked = xe_queue_exec_locked,
.queue_exec_async = xe_queue_exec_async,
.bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,