intel/aux_map: introduce ref count of L1 entries
To implement this feature, we need to do CPU-side tracking of all
L3/L2/L1 entries. This adds a few CPU allocations, but the advantage is
that traversal of the page-table tree is faster: there is no more need
for the linear search of find_buffer().

With this feature, multiple VkImages can be bound to the same main
memory address, as long as they share the exact same mapping
parameters. The AUX mapping is removed when the last VkImage is
destroyed.

As before, if the L1 mapping entry parameters don't match, the mapping
fails. Anv handles this gracefully by simply disabling AUX on the
image.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26335>
commit 7c6faa1efe
parent 02ef01fa95
committed by Marge Bot
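
As a usage illustration (an editorial sketch, not part of the commit): the
refcount semantics described above, seen from the driver side. The addresses,
size and format bits below are made-up placeholders; intel_aux_map_add_mapping()
and intel_aux_map_del_mapping() are the real entry points touched by this
change.

/* Two VkImages bound to the same main memory address with identical
 * mapping parameters share the same L1 entry. All values are hypothetical. */
uint64_t main_addr = 0x100000000ull;  /* main surface GPU address */
uint64_t aux_addr  = 0x200000000ull;  /* compression metadata GPU address */
uint64_t size_B    = 64 * 1024;       /* must be main-page aligned */
uint64_t fmt_bits  = 0;               /* format-dependent entry bits */

/* First image: the L1 entry is written, its refcount goes 0 -> 1. */
if (!intel_aux_map_add_mapping(ctx, main_addr, aux_addr, size_B, fmt_bits)) {
   /* Parameters conflict with an existing mapping: Anv falls back to
    * disabling AUX on the image. */
}

/* Second image, same parameters: the entry is reused, refcount 1 -> 2. */
intel_aux_map_add_mapping(ctx, main_addr, aux_addr, size_B, fmt_bits);

/* Destroying each image decrements the refcount; the AUX mapping is only
 * invalidated when the last reference is dropped (2 -> 1 -> 0). */
intel_aux_map_del_mapping(ctx, main_addr, size_B);
intel_aux_map_del_mapping(ctx, main_addr, size_B);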
--- a/src/intel/common/intel_aux_map.c
+++ b/src/intel/common/intel_aux_map.c
@@ -183,14 +183,33 @@ struct aux_map_buffer {
    struct intel_buffer *buffer;
 };
 
+struct intel_aux_level {
+   /* GPU address of the current level */
+   uint64_t address;
+
+   /* Pointer to the GPU entries of this level */
+   uint64_t *entries;
+
+   union {
+      /* Host tracking of a parent level to its children (only used on L3/L2
+       * levels, which have 4096 entries)
+       */
+      struct intel_aux_level *children[4096];
+
+      /* Refcount of AUX pages at the L1 level (MTL has only 16 entries in L1,
+       * while Gfx12 has 256 entries)
+       */
+      uint32_t ref_counts[256];
+   };
+};
+
 struct intel_aux_map_context {
    void *driver_ctx;
    pthread_mutex_t mutex;
+   struct intel_aux_level *l3_level;
    struct intel_mapped_pinned_buffer_alloc *buffer_alloc;
    uint32_t num_buffers;
    struct list_head buffers;
-   uint64_t level3_base_addr;
-   uint64_t *level3_map;
    uint32_t tail_offset, tail_remaining;
    uint32_t state_num;
    const struct aux_format_info *format;
@@ -250,7 +269,7 @@ select_format(const struct intel_device_info *devinfo)
 static bool
 add_buffer(struct intel_aux_map_context *ctx)
 {
-   struct aux_map_buffer *buf = ralloc(ctx, struct aux_map_buffer);
+   struct aux_map_buffer *buf = rzalloc(ctx, struct aux_map_buffer);
    if (!buf)
       return false;
 
@@ -312,20 +331,31 @@ get_current_pos(struct intel_aux_map_context *ctx, uint64_t *gpu, uint64_t **map
    *map = (uint64_t*)((uint8_t*)tail->buffer->map + ctx->tail_offset);
 }
 
-static bool
-add_sub_table(struct intel_aux_map_context *ctx, uint32_t size,
-              uint32_t align, uint64_t *gpu, uint64_t **map)
+static struct intel_aux_level *
+add_sub_table(struct intel_aux_map_context *ctx,
+              struct intel_aux_level *parent,
+              uint32_t parent_index,
+              uint32_t size, uint32_t align)
 {
    if (!align_and_verify_space(ctx, size, align)) {
       if (!add_buffer(ctx))
-         return false;
+         return NULL;
       UNUSED bool aligned = align_and_verify_space(ctx, size, align);
       assert(aligned);
    }
-   get_current_pos(ctx, gpu, map);
-   memset(*map, 0, size);
+
+   struct intel_aux_level *level = rzalloc(ctx, struct intel_aux_level);
+
+   get_current_pos(ctx, &level->address, &level->entries);
+   memset(level->entries, 0, size);
    advance_current_pos(ctx, size);
-   return true;
+
+   if (parent != NULL) {
+      assert(parent->children[parent_index] == NULL);
+      parent->children[parent_index] = level;
+   }
+
+   return level;
 }
 
 uint32_t
@@ -361,11 +391,12 @@ intel_aux_map_init(void *driver_ctx,
    ctx->tail_remaining = 0;
    ctx->state_num = 0;
 
-   if (add_sub_table(ctx, L3_L2_SUB_TABLE_LEN, L3_L2_SUB_TABLE_LEN,
-                     &ctx->level3_base_addr, &ctx->level3_map)) {
+   ctx->l3_level = add_sub_table(ctx, NULL, 0,
+                                 L3_L2_SUB_TABLE_LEN, L3_L2_SUB_TABLE_LEN);
+   if (ctx->l3_level != NULL) {
       if (aux_map_debug)
          fprintf(stderr, "AUX-MAP L3: 0x%"PRIx64", map=%p\n",
-                 ctx->level3_base_addr, ctx->level3_map);
+                 ctx->l3_level->address, ctx->l3_level->entries);
       p_atomic_inc(&ctx->state_num);
       return ctx;
    } else {
@@ -404,27 +435,7 @@ intel_aux_map_get_base(struct intel_aux_map_context *ctx)
  * This get initialized in intel_aux_map_init, and never changes, so there is
  * no need to lock the mutex.
  */
-   return ctx->level3_base_addr;
-}
-
-static struct aux_map_buffer *
-find_buffer(struct intel_aux_map_context *ctx, uint64_t addr)
-{
-   list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) {
-      if (buf->buffer->gpu <= addr && buf->buffer->gpu_end > addr) {
-         return buf;
-      }
-   }
-   return NULL;
-}
-
-static uint64_t *
-get_u64_entry_ptr(struct intel_aux_map_context *ctx, uint64_t addr)
-{
-   struct aux_map_buffer *buf = find_buffer(ctx, addr);
-   assert(buf);
-   uintptr_t map_offset = addr - buf->buffer->gpu;
-   return (uint64_t*)((uint8_t*)buf->buffer->map + map_offset);
+   return ctx->l3_level->address;
 }
 
 static uint8_t
@@ -511,52 +522,57 @@ get_l1_addr_mask(struct intel_aux_map_context *ctx)
 static void
 get_aux_entry(struct intel_aux_map_context *ctx, uint64_t main_address,
               uint32_t *l1_index_out, uint64_t *l1_entry_addr_out,
-              uint64_t **l1_entry_map_out)
+              uint64_t **l1_entry_map_out,
+              struct intel_aux_level **l1_aux_level_out)
 {
-   uint32_t l3_index = (main_address >> 36) & 0xfff;
-   uint64_t *l3_entry = &ctx->level3_map[l3_index];
+   struct intel_aux_level *l3_level = ctx->l3_level;
+   struct intel_aux_level *l2_level;
+   struct intel_aux_level *l1_level;
 
-   uint64_t *l2_map;
-   if ((*l3_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
-      uint64_t l2_addr;
-      if (add_sub_table(ctx, L3_L2_SUB_TABLE_LEN, L3_L2_SUB_TABLE_LEN,
-                        &l2_addr, &l2_map)) {
+   uint32_t l3_index = (main_address >> 36) & 0xfff;
+
+   if (l3_level->children[l3_index] == NULL) {
+      l2_level =
+         add_sub_table(ctx, ctx->l3_level, l3_index,
+                       L3_L2_SUB_TABLE_LEN, L3_L2_SUB_TABLE_LEN);
+      if (l2_level != NULL) {
          if (aux_map_debug)
             fprintf(stderr, "AUX-MAP L3[0x%x]: 0x%"PRIx64", map=%p\n",
-                    l3_index, l2_addr, l2_map);
+                    l3_index, l2_level->address, l2_level->entries);
       } else {
          unreachable("Failed to add L2 Aux-Map Page Table!");
       }
-      *l3_entry = (l2_addr & L3_ENTRY_L2_ADDR_MASK) | INTEL_AUX_MAP_ENTRY_VALID_BIT;
+      l3_level->entries[l3_index] = (l2_level->address & L3_ENTRY_L2_ADDR_MASK) |
+                                    INTEL_AUX_MAP_ENTRY_VALID_BIT;
    } else {
-      uint64_t l2_addr = intel_canonical_address(*l3_entry & L3_ENTRY_L2_ADDR_MASK);
-      l2_map = get_u64_entry_ptr(ctx, l2_addr);
+      l2_level = l3_level->children[l3_index];
    }
    uint32_t l2_index = (main_address >> 24) & 0xfff;
-   uint64_t *l2_entry = &l2_map[l2_index];
-
    uint64_t l1_page_size = ctx->format->l1_page_size;
-   uint64_t l1_addr, *l1_map;
-   if ((*l2_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
-      if (add_sub_table(ctx, l1_page_size, l1_page_size, &l1_addr, &l1_map)) {
+   if (l2_level->children[l2_index] == NULL) {
+      l1_level = add_sub_table(ctx, l2_level, l2_index, l1_page_size, l1_page_size);
+      if (l1_level != NULL) {
         if (aux_map_debug)
            fprintf(stderr, "AUX-MAP L2[0x%x]: 0x%"PRIx64", map=%p\n",
-                   l2_index, l1_addr, l1_map);
+                   l2_index, l1_level->address, l1_level->entries);
      } else {
         unreachable("Failed to add L1 Aux-Map Page Table!");
      }
-      *l2_entry = (l1_addr & get_l1_addr_mask(ctx)) | INTEL_AUX_MAP_ENTRY_VALID_BIT;
+      l2_level->entries[l2_index] = (l1_level->address & get_l1_addr_mask(ctx)) |
+                                    INTEL_AUX_MAP_ENTRY_VALID_BIT;
    } else {
-      l1_addr = intel_canonical_address(*l2_entry & get_l1_addr_mask(ctx));
-      l1_map = get_u64_entry_ptr(ctx, l1_addr);
+      l1_level = l2_level->children[l2_index];
    }
    uint32_t l1_index = get_index(main_address, ctx->format->l1_index_mask,
-                                ctx->format->l1_index_offset);
+                                 ctx->format->l1_index_offset);
    if (l1_index_out)
       *l1_index_out = l1_index;
    if (l1_entry_addr_out)
-      *l1_entry_addr_out = intel_canonical_address(l1_addr + l1_index * sizeof(*l1_map));
+      *l1_entry_addr_out = intel_canonical_address(l1_level->address + l1_index * sizeof(uint64_t));
    if (l1_entry_map_out)
-      *l1_entry_map_out = &l1_map[l1_index];
+      *l1_entry_map_out = &l1_level->entries[l1_index];
+   if (l1_aux_level_out)
+      *l1_aux_level_out = l1_level;
 }
 
 static bool
@@ -570,7 +586,8 @@ add_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
 
    uint32_t l1_index;
    uint64_t *l1_entry;
-   get_aux_entry(ctx, main_address, &l1_index, NULL, &l1_entry);
+   struct intel_aux_level *l1_aux_level;
+   get_aux_entry(ctx, main_address, &l1_index, NULL, &l1_entry, &l1_aux_level);
 
    const uint64_t l1_data =
       (aux_address & intel_aux_get_meta_address_mask(ctx)) |
@@ -579,6 +596,7 @@ add_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
 
    const uint64_t current_l1_data = *l1_entry;
    if ((current_l1_data & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
+      assert(l1_aux_level->ref_counts[l1_index] == 0);
       assert((aux_address & 0xffULL) == 0);
       if (aux_map_debug)
          fprintf(stderr, "AUX-MAP L1[0x%x] 0x%"PRIx64" -> 0x%"PRIx64"\n",
@@ -608,6 +626,8 @@ add_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
       }
    }
 
+   l1_aux_level->ref_counts[l1_index]++;
+
    return true;
 }
 
@@ -618,12 +638,71 @@ intel_aux_map_get_entry(struct intel_aux_map_context *ctx,
 {
    pthread_mutex_lock(&ctx->mutex);
    uint64_t *l1_entry_map;
-   get_aux_entry(ctx, main_address, NULL, aux_entry_address, &l1_entry_map);
+   get_aux_entry(ctx, main_address, NULL, aux_entry_address, &l1_entry_map, NULL);
    pthread_mutex_unlock(&ctx->mutex);
 
    return l1_entry_map;
 }
 
+/**
+ * We mark the leaf entry as invalid, but we don't attempt to cleanup the
+ * other levels of translation mappings. Since we attempt to re-use VMA
+ * ranges, hopefully this will not lead to unbounded growth of the translation
+ * tables.
+ */
+static void
+remove_l1_mapping_locked(struct intel_aux_map_context *ctx, uint64_t main_address,
+                         bool reset_refcount, bool *state_changed)
+{
+   uint32_t l1_index;
+   uint64_t *l1_entry;
+   struct intel_aux_level *l1_aux_level;
+   get_aux_entry(ctx, main_address, &l1_index, NULL, &l1_entry, &l1_aux_level);
+
+   const uint64_t current_l1_data = *l1_entry;
+   const uint64_t l1_data = current_l1_data & ~INTEL_AUX_MAP_ENTRY_VALID_BIT;
+
+   if ((current_l1_data & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
+      assert(l1_aux_level->ref_counts[l1_index] == 0);
+      return;
+   } else if (reset_refcount) {
+      l1_aux_level->ref_counts[l1_index] = 0;
+      if (unlikely(l1_data == 0))
+         *state_changed = true;
+      *l1_entry = l1_data;
+   } else {
+      assert(l1_aux_level->ref_counts[l1_index] > 0);
+      if (--l1_aux_level->ref_counts[l1_index] == 0) {
+         /**
+          * We use non-zero bits in 63:1 to indicate the entry had been filled
+          * previously. In the unlikely event that these are all zero, we
+          * force a flush of the aux-map tables.
+          */
+         if (unlikely(l1_data == 0))
+            *state_changed = true;
+         *l1_entry = l1_data;
+      }
+   }
+}
+
+static void
+remove_mapping_locked(struct intel_aux_map_context *ctx, uint64_t main_address,
+                      uint64_t size, bool reset_refcount, bool *state_changed)
+{
+   if (aux_map_debug)
+      fprintf(stderr, "AUX-MAP remove 0x%"PRIx64"-0x%"PRIx64"\n", main_address,
+              main_address + size);
+
+   uint64_t main_inc_addr = main_address;
+   uint64_t main_page_size = ctx->format->main_page_size;
+   assert((main_address & get_page_mask(main_page_size)) == 0);
+   while (main_inc_addr - main_address < size) {
+      remove_l1_mapping_locked(ctx, main_inc_addr, reset_refcount,
+                               state_changed);
+      main_inc_addr += main_page_size;
+   }
+}
+
 bool
 intel_aux_map_add_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
                           uint64_t aux_address, uint64_t main_size_B,
@@ -645,41 +724,32 @@ intel_aux_map_add_mapping(struct intel_aux_map_context *ctx, uint64_t main_addre
       main_inc_addr = main_inc_addr + main_page_size;
       aux_inc_addr = aux_inc_addr + aux_page_size;
    }
+   bool success = main_inc_addr - main_address >= main_size_B;
+   if (!success && (main_inc_addr - main_address) > 0) {
+      /* If the mapping failed, remove the mapped portion. */
+      remove_mapping_locked(ctx, main_address,
+                            main_size_B - (main_inc_addr - main_address),
+                            false /* reset_refcount */, &state_changed);
+   }
    pthread_mutex_unlock(&ctx->mutex);
    if (state_changed)
       p_atomic_inc(&ctx->state_num);
 
-   return main_inc_addr - main_address >= main_size_B;
+   return success;
 }
 
-/**
- * We mark the leaf entry as invalid, but we don't attempt to cleanup the
- * other levels of translation mappings. Since we attempt to re-use VMA
- * ranges, hopefully this will not lead to unbounded growth of the translation
- * tables.
- */
-static void
-remove_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
-               bool *state_changed)
+void
+intel_aux_map_del_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
+                          uint64_t size)
 {
-   uint64_t *l1_entry;
-   get_aux_entry(ctx, main_address, NULL, NULL, &l1_entry);
-
-   const uint64_t current_l1_data = *l1_entry;
-   const uint64_t l1_data = current_l1_data & ~INTEL_AUX_MAP_ENTRY_VALID_BIT;
-
-   if ((current_l1_data & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
-      return;
-   } else {
-      /**
-       * We use non-zero bits in 63:1 to indicate the entry had been filled
-       * previously. In the unlikely event that these are all zero, we force a
-       * flush of the aux-map tables.
-       */
-      if (unlikely(l1_data == 0))
-         *state_changed = true;
-      *l1_entry = l1_data;
-   }
+   bool state_changed = false;
+   pthread_mutex_lock(&ctx->mutex);
+   remove_mapping_locked(ctx, main_address, size, false /* reset_refcount */,
+                         &state_changed);
+   pthread_mutex_unlock(&ctx->mutex);
+   if (state_changed)
+      p_atomic_inc(&ctx->state_num);
 }
 
 void
@@ -688,17 +758,8 @@ intel_aux_map_unmap_range(struct intel_aux_map_context *ctx, uint64_t main_addre
 {
    bool state_changed = false;
    pthread_mutex_lock(&ctx->mutex);
-   if (aux_map_debug)
-      fprintf(stderr, "AUX-MAP remove 0x%"PRIx64"-0x%"PRIx64"\n", main_address,
-              main_address + size);
-
-   uint64_t main_inc_addr = main_address;
-   uint64_t main_page_size = ctx->format->main_page_size;
-   assert((main_address & get_page_mask(main_page_size)) == 0);
-   while (main_inc_addr - main_address < size) {
-      remove_mapping(ctx, main_inc_addr, &state_changed);
-      main_inc_addr += main_page_size;
-   }
+   remove_mapping_locked(ctx, main_address, size, true /* reset_refcount */,
+                         &state_changed);
    pthread_mutex_unlock(&ctx->mutex);
    if (state_changed)
       p_atomic_inc(&ctx->state_num);
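
For reference (an editorial sketch, not part of the diff): the address
decomposition get_aux_entry() performs while walking the host-side tree. The
L3/L2 shifts come straight from the code above; the L1 line assumes the Gfx12
64KB-main-page format, whereas the real mask and offset come from ctx->format.

uint32_t l3_index = (main_address >> 36) & 0xfff; /* bits 47:36, 4096 entries */
uint32_t l2_index = (main_address >> 24) & 0xfff; /* bits 35:24, 4096 entries */
/* L1 indexing is format dependent; assuming 64KB main pages on Gfx12:
 * bits 23:16, i.e. the 256 entries mirrored by ref_counts[]. */
uint32_t l1_index = (main_address >> 16) & 0xff;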
--- a/src/intel/common/intel_aux_map.h
+++ b/src/intel/common/intel_aux_map.h
@@ -110,12 +110,23 @@ intel_aux_map_get_entry(struct intel_aux_map_context *ctx,
                         uint64_t *aux_entry_address);
 
+/* Fails if a mapping is attempted that would conflict with an existing one.
+ * This increases the refcount of the mapped region if already mapped, and
+ * sets it to 1 otherwise.
+ */
 bool
 intel_aux_map_add_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
                           uint64_t aux_address, uint64_t main_size_B,
                           uint64_t format_bits);
 
+/* Decreases the refcount of a mapped region. When the refcount reaches 0, the
+ * region is unmapped.
+ */
+void
+intel_aux_map_del_mapping(struct intel_aux_map_context *ctx, uint64_t main_address,
+                          uint64_t size);
+
+/* Unmaps a region; the refcount is reset to 0.
+ */
 void
 intel_aux_map_unmap_range(struct intel_aux_map_context *ctx, uint64_t main_address,
                           uint64_t size);
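
To make the two removal paths concrete (again an editorial sketch; ctx,
main_addr and size_B are the placeholders from the example above):

/* Refcounted removal, e.g. on VkImage destruction: the L1 entry stays
 * valid until the last reference is dropped. */
intel_aux_map_del_mapping(ctx, main_addr, size_B);

/* Forced removal, e.g. when a VMA range is recycled: the entry is
 * invalidated and its refcount reset to 0, regardless of outstanding
 * references. */
intel_aux_map_unmap_range(ctx, main_addr, size_B);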