anv: create a pool for indirect descriptors

We'll use the fact that the pool is aligned to 4GB to limit the number
of address computations needed to build addresses in the shaders.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
This commit is contained in:
Lionel Landwerlin
2023-02-22 09:00:35 +02:00
committed by Marge Bot
parent 6367691b58
commit d2c0147228
7 changed files with 173 additions and 45 deletions

View File

@@ -126,6 +126,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
&device->dynamic_state_pool, 16384);
anv_state_stream_init(&cmd_buffer->general_state_stream,
&device->general_state_pool, 16384);
anv_state_stream_init(&cmd_buffer->push_descriptor_stream,
&device->push_descriptor_pool, 4096);
int success = u_vector_init_pow2(&cmd_buffer->dynamic_bos, 8,
sizeof(struct anv_bo *));
@@ -175,6 +177,7 @@ anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
anv_state_stream_finish(&cmd_buffer->general_state_stream);
anv_state_stream_finish(&cmd_buffer->push_descriptor_stream);
while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
@@ -220,6 +223,10 @@ anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
anv_state_stream_init(&cmd_buffer->general_state_stream,
&cmd_buffer->device->general_state_pool, 16384);
anv_state_stream_finish(&cmd_buffer->push_descriptor_stream);
anv_state_stream_init(&cmd_buffer->push_descriptor_stream,
&cmd_buffer->device->push_descriptor_pool, 4096);
while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
anv_device_release_bo(cmd_buffer->device, *bo);
@@ -950,11 +957,17 @@ anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
if (layout->descriptor_buffer_size &&
((*push_set)->set_used_on_gpu ||
set->desc_mem.alloc_size < layout->descriptor_buffer_size)) {
struct anv_physical_device *pdevice = cmd_buffer->device->physical;
struct anv_state_stream *push_stream =
pdevice->indirect_descriptors ?
&cmd_buffer->push_descriptor_stream :
&cmd_buffer->surface_state_stream;
/* The previous buffer is either actively used by some GPU command (so
* we can't modify it) or is too small. Allocate a new one.
*/
struct anv_state desc_mem =
anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
anv_state_stream_alloc(push_stream,
anv_descriptor_set_layout_descriptor_buffer_size(layout, 0),
ANV_UBO_ALIGNMENT);
if (set->desc_mem.alloc_size) {
@@ -964,10 +977,9 @@ anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
}
set->desc_mem = desc_mem;
set->desc_addr = (struct anv_address) {
.bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
.offset = set->desc_mem.offset,
};
set->desc_addr = anv_state_pool_state_address(
push_stream->state_pool,
set->desc_mem);
}
return set;

View File

@@ -1353,6 +1353,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
device->compiler->supports_shader_constants = true;
device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
device->compiler->extended_bindless_surface_offset = device->uses_ex_bso;
device->compiler->use_bindless_sampler_offset = !device->indirect_descriptors;
isl_device_init(&device->isl_dev, &device->info);
@@ -2849,10 +2850,13 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
if (device->physical->indirect_descriptors &&
get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->push_descriptor_pool.block_pool, address))
return ret_bo;
if (!device->cmd_buffer_being_decoded)
return (struct intel_batch_decode_bo) { };
@@ -3108,6 +3112,10 @@ VkResult anv_CreateDevice(
device->physical->va.high_heap.addr,
device->physical->va.high_heap.size);
util_vma_heap_init(&device->vma_desc,
device->physical->va.descriptor_pool.addr,
device->physical->va.descriptor_pool.size);
list_inithead(&device->memory_objects);
list_inithead(&device->image_private_objects);
@@ -3198,12 +3206,14 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_scratch_surface_state_pool;
result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
"bindless surface state pool",
device->physical->va.bindless_surface_state_pool.addr,
0, 4096);
if (result != VK_SUCCESS)
goto fail_internal_surface_state_pool;
if (device->physical->indirect_descriptors) {
result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
"bindless surface state pool",
device->physical->va.bindless_surface_state_pool.addr,
0, 4096);
if (result != VK_SUCCESS)
goto fail_internal_surface_state_pool;
}
if (device->info->verx10 >= 125) {
/* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding
@@ -3232,11 +3242,18 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_bindless_surface_state_pool;
result = anv_state_pool_init(&device->push_descriptor_pool, device,
"push descriptor pool",
device->physical->va.push_descriptor_pool.addr,
0, 4096);
if (result != VK_SUCCESS)
goto fail_binding_table_pool;
if (device->info->has_aux_map) {
device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator,
&physical_device->info);
if (!device->aux_map_ctx)
goto fail_binding_table_pool;
goto fail_push_descriptor_pool;
}
result = anv_device_alloc_bo(device, "workaround", 8192,
@@ -3413,10 +3430,13 @@ VkResult anv_CreateDevice(
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
}
fail_push_descriptor_pool:
anv_state_pool_finish(&device->push_descriptor_pool);
fail_binding_table_pool:
anv_state_pool_finish(&device->binding_table_pool);
fail_bindless_surface_state_pool:
anv_state_pool_finish(&device->bindless_surface_state_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
fail_internal_surface_state_pool:
anv_state_pool_finish(&device->internal_surface_state_pool);
fail_scratch_surface_state_pool:
@@ -3437,6 +3457,7 @@ VkResult anv_CreateDevice(
fail_mutex:
pthread_mutex_destroy(&device->mutex);
fail_vmas:
util_vma_heap_finish(&device->vma_desc);
util_vma_heap_finish(&device->vma_hi);
util_vma_heap_finish(&device->vma_cva);
util_vma_heap_finish(&device->vma_lo);
@@ -3513,11 +3534,13 @@ void anv_DestroyDevice(
device->aux_map_ctx = NULL;
}
anv_state_pool_finish(&device->push_descriptor_pool);
anv_state_pool_finish(&device->binding_table_pool);
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
anv_state_pool_finish(&device->internal_surface_state_pool);
anv_state_pool_finish(&device->bindless_surface_state_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);
@@ -3526,6 +3549,7 @@ void anv_DestroyDevice(
anv_bo_cache_finish(&device->bo_cache);
util_vma_heap_finish(&device->vma_desc);
util_vma_heap_finish(&device->vma_hi);
util_vma_heap_finish(&device->vma_cva);
util_vma_heap_finish(&device->vma_lo);
@@ -3588,6 +3612,9 @@ anv_vma_heap_for_flags(struct anv_device *device,
if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
return &device->vma_lo;
if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_POOL)
return &device->vma_desc;
return &device->vma_hi;
}
@@ -3634,7 +3661,8 @@ anv_vma_free(struct anv_device *device,
{
assert(vma_heap == &device->vma_lo ||
vma_heap == &device->vma_cva ||
vma_heap == &device->vma_hi);
vma_heap == &device->vma_hi ||
vma_heap == &device->vma_desc);
const uint64_t addr_48b = intel_48b_address(address);

View File

@@ -384,6 +384,9 @@ enum anv_bo_alloc_flags {
/** This buffer will be scanout to display */
ANV_BO_ALLOC_SCANOUT = (1 << 12),
/** For descriptor pools */
ANV_BO_ALLOC_DESCRIPTOR_POOL = (1 << 13),
};
struct anv_bo {
@@ -956,6 +959,8 @@ struct anv_physical_device {
struct anv_va_range scratch_surface_state_pool;
struct anv_va_range bindless_surface_state_pool;
struct anv_va_range instruction_state_pool;
struct anv_va_range descriptor_pool;
struct anv_va_range push_descriptor_pool;
struct anv_va_range client_visible_heap;
struct anv_va_range high_heap;
} va;
@@ -1123,6 +1128,7 @@ struct anv_device {
struct util_vma_heap vma_lo;
struct util_vma_heap vma_cva;
struct util_vma_heap vma_hi;
struct util_vma_heap vma_desc;
/** List of all anv_device_memory objects */
struct list_head memory_objects;
@@ -1142,6 +1148,7 @@ struct anv_device {
struct anv_state_pool scratch_surface_state_pool;
struct anv_state_pool internal_surface_state_pool;
struct anv_state_pool bindless_surface_state_pool;
struct anv_state_pool push_descriptor_pool;
struct anv_state_reserved_pool custom_border_colors;
@@ -2827,6 +2834,7 @@ struct anv_cmd_buffer {
struct anv_state_stream surface_state_stream;
struct anv_state_stream dynamic_state_stream;
struct anv_state_stream general_state_stream;
struct anv_state_stream push_descriptor_stream;
VkCommandBufferUsageFlags usage_flags;

View File

@@ -55,7 +55,10 @@ anv_device_print_vas(struct anv_physical_device *device)
PRINT_HEAP(dynamic_state_pool);
PRINT_HEAP(binding_table_pool);
PRINT_HEAP(internal_surface_state_pool);
PRINT_HEAP(scratch_surface_state_pool);
PRINT_HEAP(bindless_surface_state_pool);
PRINT_HEAP(descriptor_pool);
PRINT_HEAP(push_descriptor_pool);
PRINT_HEAP(instruction_state_pool);
PRINT_HEAP(client_visible_heap);
PRINT_HEAP(high_heap);
@@ -106,13 +109,34 @@ anv_physical_device_init_va_ranges(struct anv_physical_device *device)
* binding tables can address internal surface states & bindless surface
* states.
*/
address = align64(address, _4Gb);
address = va_add(&device->va.binding_table_pool, address, _1Gb);
address = va_add(&device->va.internal_surface_state_pool, address, 2 * _1Gb);
address = va_add(&device->va.internal_surface_state_pool, address, 1 * _1Gb);
/* Scratch surface state overlaps with the internal surface state */
va_at(&device->va.scratch_surface_state_pool,
device->va.internal_surface_state_pool.addr,
8 * _1Mb);
address = va_add(&device->va.bindless_surface_state_pool, address, _1Gb);
/* Both of the following heaps have be in the same 4Gb range from the
* binding table pool start so they can be addressed from binding table
* entries.
*/
if (device->indirect_descriptors) {
/* With indirect descriptors, we allocate bindless surface states from
* this pool.
*/
address = va_add(&device->va.bindless_surface_state_pool, address, 2 * _1Gb);
/* Descriptor buffers can go anywhere */
address = align64(address, _4Gb);
address = va_add(&device->va.descriptor_pool, address, 3 * _1Gb);
address = va_add(&device->va.push_descriptor_pool, address, _1Gb);
} else {
/* With direct descriptor, descriptors set buffers are allocated
* here.
*/
address = va_add(&device->va.descriptor_pool, address, 2 * _1Gb);
}
/* We use a trick to compute constant data offsets in the shaders to avoid
* unnecessary 64bit address computations (see lower_load_constant() in

View File

@@ -183,20 +183,47 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
sba.IndirectObjectBufferSizeModifyEnable = true;
sba.DynamicStateBufferSizeModifyEnable = true;
sba.InstructionBuffersizeModifyEnable = true;
sba.BindlessSurfaceStateBaseAddress =
(struct anv_address) { .offset =
device->physical->va.bindless_surface_state_pool.addr,
};
sba.BindlessSurfaceStateSize =
anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
sba.BindlessSurfaceStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#if GFX_VER >= 11
sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
sba.BindlessSamplerStateBufferSize = 0;
if (!device->physical->indirect_descriptors) {
#if GFX_VERx10 >= 125
/* Bindless Surface State & Bindless Sampler State are aligned to the
* same heap
*/
sba.BindlessSurfaceStateBaseAddress =
sba.BindlessSamplerStateBaseAddress =
(struct anv_address) { .offset =
device->physical->va.binding_table_pool.addr, };
sba.BindlessSurfaceStateSize =
(device->physical->va.binding_table_pool.size +
device->physical->va.internal_surface_state_pool.size +
device->physical->va.descriptor_pool.size) - 1;
sba.BindlessSamplerStateBufferSize =
(device->physical->va.binding_table_pool.size +
device->physical->va.internal_surface_state_pool.size +
device->physical->va.descriptor_pool.size) / 4096 - 1;
sba.BindlessSurfaceStateMOCS = sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable =
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
#else
unreachable("Direct descriptor not supported");
#endif
} else {
sba.BindlessSurfaceStateBaseAddress =
(struct anv_address) { .offset =
device->physical->va.bindless_surface_state_pool.addr,
};
sba.BindlessSurfaceStateSize =
anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
sba.BindlessSurfaceStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#if GFX_VER >= 11
sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
sba.BindlessSamplerStateBufferSize = 0;
#endif
}
#if GFX_VERx10 >= 125
sba.L1CacheControl = L1CC_WB;
#endif

View File

@@ -261,18 +261,39 @@ init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
sba.InstructionBaseAddressModifyEnable = true;
sba.InstructionBuffersizeModifyEnable = true;
sba.BindlessSurfaceStateBaseAddress =
(struct anv_address) { .offset =
device->physical->va.bindless_surface_state_pool.addr, };
sba.BindlessSurfaceStateSize =
anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
sba.BindlessSurfaceStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
if (device->physical->indirect_descriptors) {
sba.BindlessSurfaceStateBaseAddress =
(struct anv_address) { .offset =
device->physical->va.bindless_surface_state_pool.addr,
};
sba.BindlessSurfaceStateSize =
anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
sba.BindlessSurfaceStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
sba.BindlessSamplerStateBufferSize = 0;
sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
sba.BindlessSamplerStateBufferSize = 0;
} else {
/* Bindless Surface State & Bindless Sampler State are aligned to the
* same heap
*/
sba.BindlessSurfaceStateBaseAddress =
sba.BindlessSamplerStateBaseAddress =
(struct anv_address) { .offset = device->physical->va.binding_table_pool.addr, };
sba.BindlessSurfaceStateSize =
(device->physical->va.binding_table_pool.size +
device->physical->va.internal_surface_state_pool.size +
device->physical->va.descriptor_pool.size) - 1;
sba.BindlessSamplerStateBufferSize =
(device->physical->va.binding_table_pool.size +
device->physical->va.internal_surface_state_pool.size +
device->physical->va.descriptor_pool.size) / 4096 - 1;
sba.BindlessSurfaceStateMOCS = sba.BindlessSamplerStateMOCS = mocs;
sba.BindlessSurfaceStateBaseAddressModifyEnable =
sba.BindlessSamplerStateBaseAddressModifyEnable = true;
}
#if GFX_VERx10 >= 125
sba.L1CacheControl = L1CC_WB;

View File

@@ -344,9 +344,17 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
if (result != VK_SUCCESS)
return result;
result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool);
if (result != VK_SUCCESS)
return result;
if (device->physical->va.bindless_surface_state_pool.size > 0) {
result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool);
if (result != VK_SUCCESS)
return result;
}
if (device->physical->va.push_descriptor_pool.size > 0) {
result = pin_state_pool(device, execbuf, &device->push_descriptor_pool);
if (result != VK_SUCCESS)
return result;
}
result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
if (result != VK_SUCCESS)