intel/fs: put scratch surface in the surface state heap

In 4ceaed7839 we made scratch surface state allocations part of the
internal heap (mapped to STATE_BASE_ADDRESS::SurfaceStateBaseAddress)
so that it doesn't uses slots in the application's expected 1M
descriptors (especially with vkd3d-proton).

But all our compiler code relies on BSS
(STATE_BASE_ADDRESS::BindlessSurfaceStateBaseAddress).

The additional issue is that there is only 26bits of surface offset
available in CS instruction (CFE_STATE, 3DSTATE_VS, etc...) for
scratch surfaces. So we need the drivers to put the scratch surfaces
in the first chunk of STATE_BASE_ADDRESS::SurfaceStateBaseAddress
(hence all the driver changes).

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 4ceaed7839 ("anv: split internal surface states from descriptors")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7687
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19727>
This commit is contained in:
Lionel Landwerlin
2022-11-14 15:54:01 +02:00
committed by Marge Bot
parent daab161535
commit 9c1c1888d9
12 changed files with 98 additions and 31 deletions

View File

@@ -709,6 +709,9 @@ init_state_base_address(struct iris_batch *batch)
sba.IndirectObjectMOCS = mocs;
sba.InstructionMOCS = mocs;
sba.SurfaceStateMOCS = mocs;
#if GFX_VER >= 9
sba.BindlessSurfaceStateMOCS = mocs;
#endif
sba.GeneralStateBaseAddressModifyEnable = true;
sba.DynamicStateBaseAddressModifyEnable = true;
@@ -717,12 +720,6 @@ init_state_base_address(struct iris_batch *batch)
sba.GeneralStateBufferSizeModifyEnable = true;
sba.DynamicStateBufferSizeModifyEnable = true;
sba.SurfaceStateBaseAddressModifyEnable = true;
#if GFX_VER >= 9
sba.BindlessSurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SCRATCH_START);
sba.BindlessSurfaceStateSize = (IRIS_SCRATCH_ZONE_SIZE >> 12) - 1;
sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
sba.BindlessSurfaceStateMOCS = mocs;
#endif
#if GFX_VER >= 11
sba.BindlessSamplerStateMOCS = mocs;
#endif

View File

@@ -1567,6 +1567,11 @@ enum brw_message_target {
#define GFX8_BTI_STATELESS_NON_COHERENT 253
#define GFX9_BTI_BINDLESS 252
/* This ID doesn't map anything HW related value. It exists to inform the
* lowering code to not use the bindless heap.
*/
#define GFX125_NON_BINDLESS (1u << 16)
/* Dataport atomic operations for Untyped Atomic Integer Operation message
* (and others).
*/

View File

@@ -5090,6 +5090,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
brw_imm_ud(~0x3ffu));
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
} else if (devinfo->ver >= 8) {
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
@@ -5156,6 +5157,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
brw_imm_ud(~0x3ffu));
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
} else if (devinfo->ver >= 8) {
srcs[SURFACE_LOGICAL_SRC_SURFACE] =

View File

@@ -843,7 +843,7 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
unspill_inst->sfid = GFX12_SFID_UGM;
unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
unspill_inst->exec_size,
LSC_ADDR_SURFTYPE_BSS,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,
@@ -940,7 +940,7 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
spill_inst->sfid = GFX12_SFID_UGM;
spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
bld.dispatch_width(),
LSC_ADDR_SURFTYPE_BSS,
LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,

View File

@@ -1718,13 +1718,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
else
inst->sfid = GFX12_SFID_UGM;
/* We must have exactly one of surface and surface_handle */
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
/* We should have exactly one of surface and surface_handle. For scratch
* messages generated by brw_fs_nir.cpp we also allow a special value to
* know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface
* State Offset, or BSS = Bindless Surface State Offset).
*/
bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
(non_bindless && surface_handle.file != BAD_FILE));
enum lsc_addr_surface_type surf_type;
if (surface_handle.file != BAD_FILE)
surf_type = LSC_ADDR_SURFTYPE_BSS;
else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
if (surface_handle.file != BAD_FILE) {
assert(surface.file == IMM && (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
} else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
surf_type = LSC_ADDR_SURFTYPE_FLAT;
else
surf_type = LSC_ADDR_SURFTYPE_BTI;
@@ -1801,6 +1808,7 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
case LSC_ADDR_SURFTYPE_FLAT:
inst->src[1] = brw_imm_ud(0);
break;
case LSC_ADDR_SURFTYPE_SS:
case LSC_ADDR_SURFTYPE_BSS:
/* We assume that the driver provided the handle in the top 20 bits so
* we can use the surface handle directly as the extended descriptor.

View File

@@ -1197,7 +1197,7 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
for (unsigned i = 0; i < 16; i++) {
if (pool->surf_states[i].map != NULL) {
anv_state_pool_free(&device->internal_surface_state_pool,
anv_state_pool_free(&device->scratch_surface_state_pool,
pool->surf_states[i]);
}
}
@@ -1274,6 +1274,8 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_scratch_pool *pool,
unsigned per_thread_scratch)
{
assert(device->info->verx10 >= 125);
if (per_thread_scratch == 0)
return 0;
@@ -1290,7 +1292,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_address addr = { .bo = bo };
struct anv_state state =
anv_state_pool_alloc(&device->internal_surface_state_pool,
anv_state_pool_alloc(&device->scratch_surface_state_pool,
device->isl_dev.ss.size, 64);
isl_buffer_fill_state(&device->isl_dev, state.map,
@@ -1305,7 +1307,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
0, state.offset);
if (current) {
anv_state_pool_free(&device->internal_surface_state_pool, state);
anv_state_pool_free(&device->scratch_surface_state_pool, state);
return current;
} else {
pool->surf_states[scratch_size_log2] = state;

View File

@@ -1443,7 +1443,7 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
}
/* Add all the global BOs to the object list for softpin case. */
result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool);
if (result != VK_SUCCESS)
return result;
@@ -1451,6 +1451,10 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
if (result != VK_SUCCESS)
return result;
result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
if (result != VK_SUCCESS)
return result;
result = pin_state_pool(device, execbuf, &device->dynamic_state_pool);
if (result != VK_SUCCESS)
return result;

View File

@@ -3063,10 +3063,12 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
return ret_bo;
if (!device->cmd_buffer_being_decoded)
return (struct intel_batch_decode_bo) { };
@@ -3437,11 +3439,27 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_dynamic_state_pool;
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
"internal surface state pool",
INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
if (device->info->verx10 >= 125) {
/* Put the scratch surface states at the beginning of the internal
* surface state pool.
*/
result = anv_state_pool_init(&device->scratch_surface_state_pool, device,
"scratch surface state pool",
SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
if (result != VK_SUCCESS)
goto fail_instruction_state_pool;
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
"internal surface state pool",
INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS,
SCRATCH_SURFACE_STATE_POOL_SIZE, 4096);
} else {
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
"internal surface state pool",
INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
}
if (result != VK_SUCCESS)
goto fail_instruction_state_pool;
goto fail_scratch_surface_state_pool;
result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
"bindless surface state pool",
@@ -3549,7 +3567,9 @@ VkResult anv_CreateDevice(
* to zero and they have a valid descriptor.
*/
device->null_surface_state =
anv_state_pool_alloc(&device->internal_surface_state_pool,
anv_state_pool_alloc(device->info->verx10 >= 125 ?
&device->scratch_surface_state_pool :
&device->internal_surface_state_pool,
device->isl_dev.ss.size,
device->isl_dev.ss.align);
isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
@@ -3650,6 +3670,9 @@ VkResult anv_CreateDevice(
anv_state_pool_finish(&device->bindless_surface_state_pool);
fail_internal_surface_state_pool:
anv_state_pool_finish(&device->internal_surface_state_pool);
fail_scratch_surface_state_pool:
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
fail_dynamic_state_pool:
@@ -3738,6 +3761,8 @@ void anv_DestroyDevice(
}
anv_state_pool_finish(&device->binding_table_pool);
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
anv_state_pool_finish(&device->internal_surface_state_pool);
anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);

View File

@@ -2679,6 +2679,8 @@ anv_CreateImageView(VkDevice _device,
.h = image->vk.extent.height,
.d = image->vk.extent.depth,
});
iview->planes[vplane].lowered_surface_state_is_null = true;
}
}
}

View File

@@ -160,8 +160,10 @@ struct intel_perf_query_result;
#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL
#define INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */
#define INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */
#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
#define SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB (8MiB overlaps surface state pool) */
#define SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001407fffffULL
#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB (64MiB) */
#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001c3ffffffULL
#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000200000000ULL /* 8 GiB */
#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x00023fffffffULL
#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x000240000000ULL /* 9 GiB */
@@ -177,10 +179,12 @@ struct intel_perf_query_result;
#define BINDING_TABLE_POOL_SIZE \
(BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
#define BINDING_TABLE_POOL_BLOCK_SIZE (65536)
#define INTERNAL_SURFACE_STATE_POOL_SIZE \
(INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
#define SCRATCH_SURFACE_STATE_POOL_SIZE \
(SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS - SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
#define BINDLESS_SURFACE_STATE_POOL_SIZE \
(BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS - BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
#define INTERNAL_SURFACE_STATE_POOL_SIZE \
(INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
#define INSTRUCTION_STATE_POOL_SIZE \
(INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
#define CLIENT_VISIBLE_HEAP_SIZE \
@@ -1158,6 +1162,7 @@ struct anv_device {
struct anv_state_pool dynamic_state_pool;
struct anv_state_pool instruction_state_pool;
struct anv_state_pool binding_table_pool;
struct anv_state_pool scratch_surface_state_pool;
struct anv_state_pool internal_surface_state_pool;
struct anv_state_pool bindless_surface_state_pool;
@@ -3831,6 +3836,8 @@ struct anv_image_view {
*/
struct anv_surface_state storage_surface_state;
struct anv_surface_state lowered_storage_surface_state;
bool lowered_surface_state_is_null;
} planes[3];
};

View File

@@ -181,7 +181,11 @@ static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
uint32_t offset)
{
#if GFX_VERX10 >= 125
return SCRATCH_SURFACE_STATE_POOL_SIZE + offset;
#else
return offset;
#endif
}
static void *

View File

@@ -2421,7 +2421,12 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
continue;
}
const struct anv_descriptor *desc = &set->descriptors[binding->index];
/* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress
* heap. Depending on where the descriptor surface state is
* allocated, they can either come from
* device->internal_surface_state_pool or
* device->bindless_surface_state_pool.
*/
switch (desc->type) {
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -2451,10 +2456,11 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
binding->lowered_storage_surface
? desc->image_view->planes[binding->plane].lowered_storage_surface_state
: desc->image_view->planes[binding->plane].storage_surface_state;
surface_state =
anv_bindless_state_for_binding_table(sstate.state);
const bool lowered_surface_state_is_null =
desc->image_view->planes[binding->plane].lowered_surface_state_is_null;
surface_state = anv_bindless_state_for_binding_table(sstate.state);
assert(surface_state.alloc_size);
if (surface_state.offset == 0) {
if (binding->lowered_storage_surface && lowered_surface_state_is_null) {
mesa_loge("Bound a image to a descriptor where the "
"descriptor does not have NonReadable "
"set and the image does not have a "
@@ -2752,7 +2758,12 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
anv_batch_emit(&cmd_buffer->batch,
GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
btp._3DCommandSubOpcode = binding_table_opcodes[s];
#if GFX_VERX10 >= 125
btp.PointertoVSBindingTable = SCRATCH_SURFACE_STATE_POOL_SIZE +
cmd_buffer->state.binding_tables[s].offset;
#else
btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
#endif
}
}
}