anv/allocator: Simplify anv_scratch_pool

The previous implementation was being overly clever and using the
anv_bo::size field as its mutex.  Scratch pool allocations don't happen
often, will happen at most a fixed number of times, and never happen in the
critical path (they only happen in shader compilation).  We can make this
much simpler by just using the device mutex.  This also means that we can
start using anv_bo_init_new directly on the bo and avoid setting fields
one-at-a-time.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
This commit is contained in:
Jason Ekstrand
2016-11-01 13:10:11 -07:00
parent 6283b6d56a
commit bd0f8d5070
2 changed files with 56 additions and 62 deletions

View File

@@ -888,9 +888,9 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
{
for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
for (unsigned i = 0; i < 16; i++) {
struct anv_bo *bo = &pool->bos[i][s];
if (bo->size > 0)
anv_gem_close(device, bo->gem_handle);
struct anv_scratch_bo *bo = &pool->bos[i][s];
if (bo->exists > 0)
anv_gem_close(device, bo->bo.gem_handle);
}
}
}
@@ -905,70 +905,59 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
assert(scratch_size_log2 < 16);
struct anv_bo *bo = &pool->bos[scratch_size_log2][stage];
struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage];
/* From now on, we go into a critical section. In order to remain
* thread-safe, we use the bo size as a lock. A value of 0 means we don't
* have a valid BO yet. A value of 1 means locked. A value greater than 1
* means we have a bo of the given size.
/* We can use "exists" to shortcut and ignore the critical section */
if (bo->exists)
return &bo->bo;
pthread_mutex_lock(&device->mutex);
__sync_synchronize();
if (bo->exists)
return &bo->bo;
const struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
const struct gen_device_info *devinfo = &physical_device->info;
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating which
* subslice, EU within a subslice, and thread within an EU it is.
* There's a maximum of two slices and two subslices, so these can be
* stored with a single bit. Even though there are only 10 EUs per
* subslice, this is stored in 4 bits, so there's an effective maximum
* value of 16 EUs. Similarly, although there are only 7 threads per EU,
* this is stored in a 3 bit number, giving an effective maximum value
* of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned subslices = MAX2(physical_device->subslice_total, 1);
const unsigned scratch_ids_per_subslice =
device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
if (bo->size > 1)
return bo;
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices,
};
uint64_t size = __sync_val_compare_and_swap(&bo->size, 0, 1);
if (size == 0) {
/* We own the lock. Allocate a buffer */
uint32_t size = per_thread_scratch * max_threads[stage];
const struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
const struct gen_device_info *devinfo = &physical_device->info;
anv_bo_init_new(&bo->bo, device, size);
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating which
* subslice, EU within a subslice, and thread within an EU it is.
* There's a maximum of two slices and two subslices, so these can be
* stored with a single bit. Even though there are only 10 EUs per
* subslice, this is stored in 4 bits, so there's an effective maximum
* value of 16 EUs. Similarly, although there are only 7 threads per EU,
* this is stored in a 3 bit number, giving an effective maximum value
* of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned subslices = MAX2(physical_device->subslice_total, 1);
const unsigned scratch_ids_per_subslice =
device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
/* Set the exists last because it may be read by other threads */
__sync_synchronize();
bo->exists = true;
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices,
};
pthread_mutex_unlock(&device->mutex);
size = per_thread_scratch * max_threads[stage];
struct anv_bo new_bo;
anv_bo_init_new(&new_bo, device, size);
bo->gem_handle = new_bo.gem_handle;
/* Set the size last because we use it as a lock */
__sync_synchronize();
bo->size = size;
futex_wake((uint32_t *)&bo->size, INT_MAX);
} else {
/* Someone else got here first */
while (bo->size == 1)
futex_wait((uint32_t *)&bo->size, 1);
}
return bo;
return &bo->bo;
}

View File

@@ -453,9 +453,14 @@ VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo,
uint32_t size);
void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo);
struct anv_scratch_bo {
bool exists;
struct anv_bo bo;
};
struct anv_scratch_pool {
/* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */
struct anv_bo bos[16][MESA_SHADER_STAGES];
struct anv_scratch_bo bos[16][MESA_SHADER_STAGES];
};
void anv_scratch_pool_init(struct anv_device *device,