radv: upload shader binaries of a pipeline contiguously in memory

RGP expects the shaders of a pipeline to be contiguous in memory; otherwise capture
sizes explode because we have to emit huge captures with lots of holes.

This reduces the capture size of Cyberpunk 2077 from ~3.5 GiB to ~180 MiB.

This should also help with future pipeline libraries.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13690>
Author:       Samuel Pitoiset
Date:         2021-11-05 13:58:12 +01:00
Committed by: Marge Bot
Parent:       a7f0463612
Commit:       3fa2220838

7 changed files with 169 additions and 72 deletions
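The new radv_upload_shaders helper below implements the contiguous layout. As a rough standalone illustration of the idea (not the actual RADV code; ALLOC_ALIGN, struct shader, total_code_size and upload_contiguously are made-up names standing in for RADV_SHADER_ALLOC_ALIGNMENT, struct radv_shader and the slab logic), the scheme is: compute the total aligned code size of every shader in the pipeline, allocate one slab, then copy each binary at increasing offsets so all of them end up back to back.

/* Minimal sketch, assuming a power-of-two alignment and pre-compiled code blobs. */
#include <stdint.h>
#include <string.h>

#define ALLOC_ALIGN 256u /* stand-in for RADV_SHADER_ALLOC_ALIGNMENT */

static uint32_t align_u32(uint32_t v, uint32_t a) { return (v + a - 1) & ~(a - 1); }

struct shader { uint32_t code_size; const void *code; uint64_t va; };

/* Total slab size needed so that every shader gets an aligned, contiguous slot. */
static uint32_t total_code_size(struct shader **shaders, unsigned count)
{
   uint32_t size = 0;
   for (unsigned i = 0; i < count; i++)
      if (shaders[i])
         size += align_u32(shaders[i]->code_size, ALLOC_ALIGN);
   return size;
}

/* Copy every shader into one slab mapped at slab_ptr whose GPU address is slab_va,
 * recording each shader's final virtual address as it goes. */
static void upload_contiguously(struct shader **shaders, unsigned count,
                                char *slab_ptr, uint64_t slab_va)
{
   uint32_t offset = 0;
   for (unsigned i = 0; i < count; i++) {
      if (!shaders[i])
         continue;
      shaders[i]->va = slab_va + offset;
      memcpy(slab_ptr + offset, shaders[i]->code, shaders[i]->code_size);
      offset += align_u32(shaders[i]->code_size, ALLOC_ALIGN);
   }
}

With this layout a single buffer object backs the whole pipeline, which is why the command buffer code below only has to add pipeline->slab->alloc->arena->bo instead of one BO per shader.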


@@ -1391,15 +1391,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
radv_emit_batch_break_on_new_ps(cmd_buffer);
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
if (!pipeline->shaders[i])
continue;
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
}
if (radv_pipeline_has_gs_copy_shader(pipeline))
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->slab->alloc->arena->bo);
if (unlikely(cmd_buffer->device->trace_bo))
radv_save_pipeline(cmd_buffer, pipeline);
@@ -4848,8 +4840,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
cmd_buffer->compute_scratch_waves_wanted =
MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->slab->alloc->arena->bo);
if (unlikely(cmd_buffer->device->trace_bo))
radv_save_pipeline(cmd_buffer, pipeline);


@@ -832,10 +832,6 @@ radv_trap_handler_init(struct radv_device *device)
return false;
}
result = ws->buffer_make_resident(ws, device->trap_handler_shader->bo, true);
if (result != VK_SUCCESS)
return false;
result = ws->buffer_create(ws, TMA_BO_SIZE, 256, RADEON_DOMAIN_VRAM,
RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_ZERO_VRAM | RADEON_FLAG_32BIT,
@@ -873,7 +869,6 @@ radv_trap_handler_finish(struct radv_device *device)
struct radeon_winsys *ws = device->ws;
if (unlikely(device->trap_handler_shader)) {
ws->buffer_make_resident(ws, device->trap_handler_shader->bo, false);
radv_shader_destroy(device, device->trap_handler_shader);
}


@@ -171,6 +171,37 @@ radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
return !!pipeline->gs_copy_shader;
}
static struct radv_pipeline_slab *
radv_pipeline_slab_create(struct radv_device *device, struct radv_pipeline *pipeline,
uint32_t code_size)
{
struct radv_pipeline_slab *slab;
slab = calloc(1, sizeof(*slab));
if (!slab)
return NULL;
slab->ref_count = 1;
slab->alloc = radv_alloc_shader_memory(device, code_size, pipeline);
if (!slab->alloc) {
free(slab);
return NULL;
}
return slab;
}
void
radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab)
{
if (!p_atomic_dec_zero(&slab->ref_count))
return;
radv_free_shader_memory(device, slab->alloc);
free(slab);
}
void
radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
const VkAllocationCallbacks *allocator)
@@ -183,6 +214,9 @@ radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline
free(pipeline->library.stages);
}
if (pipeline->slab)
radv_pipeline_slab_destroy(device, pipeline->slab);
for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
if (pipeline->shaders[i])
radv_shader_destroy(device, pipeline->shaders[i]);
@@ -3354,6 +3388,61 @@ non_uniform_access_callback(const nir_src *src, void *_)
return nir_chase_binding(*src).success ? 0x2 : 0x3;
}
VkResult
radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
struct radv_shader_binary **binaries, struct radv_shader_binary *gs_copy_binary)
{
uint32_t code_size = 0;
/* Compute the total code size. */
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
struct radv_shader *shader = pipeline->shaders[i];
if (!shader)
continue;
code_size += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
}
if (pipeline->gs_copy_shader) {
code_size += align(pipeline->gs_copy_shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
}
/* Allocate memory for all shader binaries. */
pipeline->slab = radv_pipeline_slab_create(device, pipeline, code_size);
if (!pipeline->slab)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
/* Upload shader binaries. */
uint64_t slab_va = radv_buffer_get_va(pipeline->slab->alloc->arena->bo);
uint32_t slab_offset = pipeline->slab->alloc->offset;
char *slab_ptr = pipeline->slab->alloc->arena->ptr;
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
struct radv_shader *shader = pipeline->shaders[i];
if (!shader)
continue;
shader->va = slab_va + slab_offset;
void *dest_ptr = slab_ptr + slab_offset;
if (!radv_shader_binary_upload(device, binaries[i], shader, dest_ptr))
return VK_ERROR_OUT_OF_HOST_MEMORY;
slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
}
if (pipeline->gs_copy_shader) {
pipeline->gs_copy_shader->va = slab_va + slab_offset;
void *dest_ptr = slab_ptr + slab_offset;
if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader, dest_ptr))
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
return VK_SUCCESS;
}
VkResult
radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
struct radv_device *device, struct radv_pipeline_cache *cache,
@@ -3411,11 +3500,6 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
radv_create_shaders_from_pipeline_cache(device, cache, hash, pipeline,
stack_sizes, num_stack_sizes,
&found_in_application_cache)) {
if (modules[MESA_SHADER_GEOMETRY] && !pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
/* We use the CS slot because graphics pipelines might use all the other ones. */
pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
}
radv_stop_feedback(pipeline_feedback, found_in_application_cache);
return VK_SUCCESS;
}
@@ -3692,19 +3776,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
}
/* Upload shader binaries. */
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
struct radv_shader *shader = pipeline->shaders[i];
if (!shader)
continue;
if (!radv_shader_binary_upload(device, binaries[i], shader))
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
if (i == MESA_SHADER_GEOMETRY && pipeline->gs_copy_shader) {
if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader))
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
}
radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
if (!keep_executable_info) {
if (pipeline->gs_copy_shader) {


@@ -39,6 +39,7 @@ struct cache_entry {
uint32_t binary_sizes[MESA_VULKAN_SHADER_STAGES];
uint32_t num_stack_sizes;
struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
struct radv_pipeline_slab *slab;
char code[0];
};
@@ -94,6 +95,8 @@ radv_pipeline_cache_finish(struct radv_pipeline_cache *cache)
if (cache->hash_table[i]->shaders[j])
radv_shader_destroy(cache->device, cache->hash_table[i]->shaders[j]);
}
if (cache->hash_table[i]->slab)
radv_pipeline_slab_destroy(cache->device, cache->hash_table[i]->slab);
vk_free(&cache->alloc, cache->hash_table[i]);
}
mtx_destroy(&cache->mutex);
@@ -298,6 +301,7 @@ radv_create_shaders_from_pipeline_cache(
uint32_t *num_stack_sizes, bool *found_in_application_cache)
{
struct cache_entry *entry;
VkResult result;
if (!cache) {
cache = device->mem_cache;
@@ -347,6 +351,9 @@ radv_create_shaders_from_pipeline_cache(
}
}
struct radv_shader_binary *binaries[MESA_VULKAN_SHADER_STAGES] = {NULL};
struct radv_shader_binary *gs_copy_binary = NULL;
bool needs_upload = false;
char *p = entry->code;
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
if (!entry->shaders[i] && entry->binary_sizes[i]) {
@@ -356,13 +363,8 @@ radv_create_shaders_from_pipeline_cache(
entry->shaders[i] = radv_shader_create(device, binary, false, true, NULL);
if (!radv_shader_binary_upload(device, binary, entry->shaders[i])) {
free(binary);
radv_pipeline_cache_unlock(cache);
return false;
}
free(binary);
needs_upload = true;
binaries[i] = binary;
} else if (entry->binary_sizes[i]) {
p += entry->binary_sizes[i];
}
@@ -370,6 +372,33 @@ radv_create_shaders_from_pipeline_cache(
memcpy(pipeline->shaders, entry->shaders, sizeof(entry->shaders));
if (pipeline->shaders[MESA_SHADER_GEOMETRY] &&
!pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
/* For the GS copy shader, RADV uses the compute shader slot to avoid a new cache entry. */
pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
gs_copy_binary = binaries[MESA_SHADER_COMPUTE];
}
if (needs_upload) {
result = radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
if (pipeline->shaders[i])
free(binaries[i]);
}
free(gs_copy_binary);
if (result != VK_SUCCESS) {
radv_pipeline_cache_unlock(cache);
return false;
}
entry->slab = pipeline->slab;
} else {
pipeline->slab = entry->slab;
}
if (num_stack_sizes) {
*num_stack_sizes = entry->num_stack_sizes;
if (entry->num_stack_sizes) {
@@ -388,6 +417,7 @@ radv_create_shaders_from_pipeline_cache(
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
if (entry->shaders[i])
p_atomic_inc(&entry->shaders[i]->ref_count);
p_atomic_inc(&entry->slab->ref_count);
}
assert((uintptr_t)p <= (uintptr_t)entry + entry_size(entry));
@@ -417,6 +447,12 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
pipeline->shaders[i] = entry->shaders[i];
p_atomic_inc(&pipeline->shaders[i]->ref_count);
}
radv_pipeline_slab_destroy(cache->device, pipeline->slab);
pipeline->slab = entry->slab;
p_atomic_inc(&pipeline->slab->ref_count);
radv_pipeline_cache_unlock(cache);
return;
}
@@ -499,6 +535,9 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
p_atomic_inc(&pipeline->shaders[i]->ref_count);
}
entry->slab = pipeline->slab;
p_atomic_inc(&pipeline->slab->ref_count);
radv_pipeline_cache_add_entry(cache, entry);
cache->modified = true;
@@ -541,6 +580,7 @@ radv_pipeline_cache_load(struct radv_pipeline_cache *cache, const void *data, si
memcpy(dest_entry, entry, size_of_entry);
for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
dest_entry->shaders[i] = NULL;
dest_entry->slab = NULL;
radv_pipeline_cache_add_entry(cache, dest_entry);
}
p += size_of_entry;
@@ -638,6 +678,7 @@ radv_GetPipelineCacheData(VkDevice _device, VkPipelineCache _cache, size_t *pDat
memcpy(p, entry, size_of_entry);
for (int j = 0; j < MESA_VULKAN_SHADER_STAGES; ++j)
((struct cache_entry *)p)->shaders[j] = NULL;
((struct cache_entry *)p)->slab = NULL;
p = (char *)p + size_of_entry;
}
*pDataSize = (char *)p - (char *)pData;


@@ -373,6 +373,10 @@ void radv_pipeline_cache_insert_shaders(
struct radv_pipeline *pipeline, struct radv_shader_binary *const *binaries,
const struct radv_pipeline_shader_stack_size *stack_sizes, uint32_t num_stack_sizes);
VkResult radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
struct radv_shader_binary **binaries,
struct radv_shader_binary *gs_copy_binary);
enum radv_blit_ds_layout {
RADV_BLIT_DS_LAYOUT_TILE_ENABLE,
RADV_BLIT_DS_LAYOUT_TILE_DISABLE,
@@ -1783,6 +1787,14 @@ struct radv_pipeline_shader_stack_size {
uint32_t non_recursive_size;
};
struct radv_pipeline_slab {
uint32_t ref_count;
union radv_shader_arena_block *alloc;
};
void radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab);
struct radv_pipeline {
struct vk_object_base base;
enum radv_pipeline_type type;
@@ -1790,6 +1802,8 @@ struct radv_pipeline {
struct radv_device *device;
struct radv_dynamic_state dynamic_state;
struct radv_pipeline_slab *slab;
bool need_indirect_descriptor_sets;
struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
struct radv_shader *gs_copy_shader;


@@ -1089,8 +1089,8 @@ free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
* this should allocate blocks for shaders fast and with no fragmentation, while still allowing
* free'd memory to be re-used.
*/
static union radv_shader_arena_block *
alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
union radv_shader_arena_block *
radv_alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
{
size = align(size, RADV_SHADER_ALLOC_ALIGNMENT);
@@ -1209,8 +1209,8 @@ get_hole(struct radv_shader_arena *arena, struct list_head *head)
return hole->freelist.prev ? hole : NULL;
}
static void
free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
void
radv_free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
{
mtx_lock(&device->shader_arena_mutex);
@@ -1256,16 +1256,6 @@ free_shader_memory(struct radv_device *device, union radv_shader_arena_block *al
mtx_unlock(&device->shader_arena_mutex);
}
static void *
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader *shader)
{
shader->alloc = alloc_shader_memory(device, shader->code_size, shader);
if (!shader->alloc)
return NULL;
shader->bo = shader->alloc->arena->bo;
return shader->alloc->arena->ptr + shader->alloc->offset;
}
void
radv_init_shader_arenas(struct radv_device *device)
{
@@ -1608,16 +1598,8 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader *shad
bool
radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
struct radv_shader *shader)
struct radv_shader *shader, void *dest_ptr)
{
void *dest_ptr;
dest_ptr = radv_alloc_shader_memory(device, shader);
if (!dest_ptr) {
free(shader);
return false;
}
if (binary->type == RADV_BINARY_TYPE_RTLD) {
struct ac_rtld_binary rtld_binary = {0};
@@ -1959,7 +1941,7 @@ upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, uns
if (!prolog)
return NULL;
prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL);
prolog->alloc = radv_alloc_shader_memory(device, bin->code_size, NULL);
if (!prolog->alloc) {
free(prolog);
return NULL;
@@ -2027,8 +2009,6 @@ radv_shader_destroy(struct radv_device *device, struct radv_shader *shader)
if (!p_atomic_dec_zero(&shader->ref_count))
return;
free_shader_memory(device, shader->alloc);
free(shader->spirv);
free(shader->nir_string);
free(shader->disasm_string);
@@ -2043,14 +2023,14 @@ radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolo
if (!prolog)
return;
free_shader_memory(device, prolog->alloc);
radv_free_shader_memory(device, prolog->alloc);
free(prolog);
}
uint64_t
radv_shader_get_va(const struct radv_shader *shader)
{
return radv_buffer_get_va(shader->bo) + shader->alloc->offset;
return shader->va;
}
struct radv_shader *


@@ -454,8 +454,8 @@ union radv_shader_arena_block {
struct radv_shader {
uint32_t ref_count;
struct radeon_winsys_bo *bo;
union radv_shader_arena_block *alloc;
uint64_t va;
struct ac_shader_config config;
uint8_t *code_ptr;
uint32_t code_size;
@@ -515,7 +515,11 @@ struct radv_shader *radv_shader_compile(
struct radv_shader_binary **binary_out);
bool radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
struct radv_shader *shader);
struct radv_shader *shader, void *dest_ptr);
union radv_shader_arena_block *radv_alloc_shader_memory(struct radv_device *device, uint32_t size,
void *ptr);
void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc);
struct radv_shader *
radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir,