anv/trtt: make all contexts have the same TR-TT programming

On Gen12 (the oldest we support on Mesa right now for TR-TT) we
started having per-engine TR-TT registers and we are supposed to make
all contexts share the same TR-TT programming.

On LNL+, this is documented in the BSpec page for the TRTT_CNTRL
register (68417), with more details in HSDs 14020454786 and
16022013154.

On Gen12 platforms this information is a little harder to find and
there's a whole trail of HSDs leading up to 1209977595, which links to
the documents that describe the programming. BSpec for TR-TT on Gen12
is very confusing as it still contains registers and other information
from Gen11 that were not removed.

Regarding the additional BLT and COMP registers, please notice that on
the BSpec pages for the TR-TT registers, the "Register Instance"
section only lists the GFX registers as non-privileged. However, the
"User Mode Privileged Commands" lists the other instances of the TR-TT
registers as non-privileged, which matches what we see: there's no
need to put these addresses in the FORCE_TO_NONPRIV registers.

Notice that for now, when TR-TT is being used we only expose a single
queue, so this change effectively does nothing until we start exposing
extra queues. I left that part for later to help bisectability.

v2:
 - s/trtt_init_context_state/trtt_init_queues_state/ (José)
 - pass device as the argument to init_queues_state (José)
v3:
 - use async_submit_end (José)

Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30252>
This commit is contained in:
Paulo Zanoni
2024-06-25 14:36:35 -07:00
parent 6415027d85
commit 5ca224aa0c
3 changed files with 77 additions and 33 deletions

View File

@@ -354,8 +354,7 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
void
genX(emit_simple_shader_end)(struct anv_simple_shader *state);
VkResult genX(init_trtt_context_state)(struct anv_device *device,
struct anv_async_submit *submit);
VkResult genX(init_trtt_context_state)(struct anv_async_submit *submit);
void genX(write_trtt_entries)(struct anv_async_submit *submit,
struct anv_trtt_bind *l3l2_binds,

View File

@@ -405,9 +405,8 @@ trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo,
}
static VkResult
anv_trtt_init_context_state(struct anv_queue *queue)
anv_trtt_init_queues_state(struct anv_device *device)
{
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct anv_bo *l3_bo;
@@ -417,43 +416,52 @@ anv_trtt_init_context_state(struct anv_queue *queue)
trtt->l3_mirror = vk_zalloc(&device->vk.alloc, 4096, 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!trtt->l3_mirror) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
return result;
}
if (!trtt->l3_mirror)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* L3 has 512 entries, so we can have up to 512 L2 tables. */
trtt->l2_mirror = vk_zalloc(&device->vk.alloc, 512 * 4096, 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!trtt->l2_mirror) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_free_l3;
vk_free(&device->vk.alloc, trtt->l3_mirror);
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
struct anv_async_submit submits[device->queue_count];
int submits_used = 0;
for (uint32_t i = 0; i < device->queue_count; i++) {
struct anv_queue *q = &device->queues[i];
struct anv_async_submit submit;
result = anv_async_submit_init(&submit, queue, &device->batch_bo_pool,
false, true);
if (result != VK_SUCCESS)
return result;
result = anv_async_submit_init(&submits[submits_used], q,
&device->batch_bo_pool, false, true);
if (result != VK_SUCCESS)
break;
result = anv_genX(device->info, init_trtt_context_state)(device, &submit);
if (result != VK_SUCCESS)
goto fail_fini_submit;
struct anv_async_submit *submit = &submits[submits_used++];
anv_genX(device->info, async_submit_end)(&submit);
result = anv_genX(device->info, init_trtt_context_state)(submit);
if (result != VK_SUCCESS) {
anv_async_submit_fini(submit);
submits_used--;
break;
}
result = device->kmd_backend->queue_exec_async(&submit, 0, NULL, 1,
&submit.signal);
anv_genX(device->info, async_submit_end)(submit);
anv_async_submit_wait(&submit);
result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 1,
&submit->signal);
if (result != VK_SUCCESS) {
anv_async_submit_fini(submit);
submits_used--;
break;
}
}
fail_fini_submit:
anv_async_submit_fini(&submit);
return result;
for (uint32_t i = 0; i < submits_used; i++) {
anv_async_submit_wait(&submits[i]);
anv_async_submit_fini(&submits[i]);
}
fail_free_l3:
vk_free(&device->vk.alloc, trtt->l3_mirror);
return result;
}
@@ -645,7 +653,7 @@ anv_sparse_bind_trtt(struct anv_device *device,
* submission.
*/
if (!trtt->l3_addr) {
result = anv_trtt_init_context_state(sparse_submit->queue);
result = anv_trtt_init_queues_state(device);
if (result != VK_SUCCESS)
goto error_add_bind;
}

View File

@@ -1442,10 +1442,11 @@ genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
}
VkResult
genX(init_trtt_context_state)(struct anv_device *device,
struct anv_async_submit *submit)
genX(init_trtt_context_state)(struct anv_async_submit *submit)
{
#if GFX_VER >= 12
struct anv_queue *queue = submit->queue;
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct anv_batch *batch = &submit->batch;
@@ -1462,25 +1463,61 @@ genX(init_trtt_context_state)(struct anv_device *device,
anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high)
trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
anv_batch_write_reg(batch, GENX(BLT_TRTT_INVAL), trtt_inval)
trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
anv_batch_write_reg(batch, GENX(BLT_TRTT_NULL), trtt_null)
trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_LOW), trtt_base_low)
trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_HIGH), trtt_base_high)
trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_INVAL), trtt_inval)
trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_NULL), trtt_null)
trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_LOW), trtt_base_low)
trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_HIGH), trtt_base_high)
trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
#if GFX_VER >= 20
uint32_t trva_base = device->physical->va.trtt.addr >> 44;
anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range)
trtt_va_range.TRVABase = trva_base;
anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range)
trtt_va_range.TRVABase = trva_base;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range)
trtt_va_range.TRVABase = trva_base;
#else
anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
trtt_va_range.TRVAMaskValue = 0xF;
trtt_va_range.TRVADataValue = 0xF;
}
anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) {
trtt_va_range.TRVAMaskValue = 0xF;
trtt_va_range.TRVADataValue = 0xF;
}
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) {
trtt_va_range.TRVAMaskValue = 0xF;
trtt_va_range.TRVADataValue = 0xF;
}
#endif
/* Enabling TR-TT needs to be done after setting up the other registers.
*/
anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr)
trtt_cr.TRTTEnable = true;
anv_batch_write_reg(batch, GENX(BLT_TRTT_CR), trtt_cr)
trtt_cr.TRTTEnable = true;
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_CR), trtt_cr)
trtt_cr.TRTTEnable = true;
genx_batch_emit_pipe_control(batch, device->info, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_TLB_INVALIDATE_BIT);
if (queue->family->engine_class != INTEL_ENGINE_CLASS_COPY) {
genx_batch_emit_pipe_control(batch, device->info, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_TLB_INVALIDATE_BIT);
}
#endif
return VK_SUCCESS;
}