panfrost: Implement a disk cache

Wire up the Mesa shader disk cache into Panfrost. Coupled with the
precompiles from the previous patch, this should greatly reduce shader
recompile jank.

This is a bare bones implementation. Obvious future work includes:

- Caching internal (outside of Gallium) shaders
- Implementing finalize_nir to reduce the on-disk size of shaders

That doesn't need to come in this patch.

This patch does shuffle some allocation patterns around to avoid extra
nir_shader_clones, but the result should be pretty clean.

---

Consider dEQP-GLES31.functional.ssbo.layout.basic_unsized_array.* in the CTS.
With a cold cache:

   44.11user 0.66system 0:45.44elapsed 98%CPU (0avgtext+0avgdata 267804maxresident)k
   0inputs+0outputs (130major+74725minor)pagefaults 0swaps

But with this commit and a warm cache:

   4.07user 0.35system 0:04.56elapsed 96%CPU (0avgtext+0avgdata 211012maxresident)k
   0inputs+0outputs (1major+49489minor)pagefaults 0swaps

That's an 11x improvement!

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19363>
This commit is contained in:
Alyssa Rosenzweig
2022-10-14 12:32:26 -04:00
committed by Marge Bot
parent b35a55bb42
commit 40372bd720
7 changed files with 341 additions and 74 deletions

View File

@@ -20,6 +20,7 @@
# SOFTWARE.
files_panfrost = files(
'pan_disk_cache.c',
'pan_helpers.c',
'pan_public.h',
'pan_screen.c',

View File

@@ -51,6 +51,7 @@
#include "pan_util.h"
#include "decode.h"
#include "util/pan_lower_framebuffer.h"
#include "compiler/nir/nir_serialize.h"
static void
panfrost_clear(

View File

@@ -279,8 +279,18 @@ struct panfrost_fs_key {
};
struct panfrost_shader_key {
/* If we need vertex shader keys, union it in */
struct panfrost_fs_key fs;
union {
/* Vertex shaders do not use shader keys. However, we have a
* special "transform feedback" vertex program derived from a
* vertex shader. If vs_is_xfb is set on a vertex shader, this
* is a transform feedback shader, else it is a regular
* (unkeyed) vertex shader.
*/
bool vs_is_xfb;
/* Fragment shaders use regular shader keys */
struct panfrost_fs_key fs;
};
};
struct panfrost_compiled_shader {
@@ -308,7 +318,14 @@ struct panfrost_compiled_shader {
/* Shader CSO */
struct panfrost_uncompiled_shader {
nir_shader *nir;
/* NIR for the shader. For graphics, this will be non-NULL even for
* TGSI. For compute, this will be NULL after the shader is compiled,
* as we don't need any compute variants.
*/
const nir_shader *nir;
/* A SHA1 of the serialized NIR for the disk cache. */
unsigned char nir_sha1[20];
/* Stream output information */
struct pipe_stream_output_info stream_output;
@@ -329,6 +346,35 @@ struct panfrost_uncompiled_shader {
uint32_t fixed_varying_mask;
};
/* The binary artefacts of compiling a shader. This differs from
 * panfrost_compiled_shader, which adds extra metadata beyond compiling but
 * throws away information not needed after the initial compile.
 *
 * This structure is serialized for the shader disk cache:
 * panfrost_disk_cache_store() writes the size and contents of `binary`
 * followed by the raw bytes of `info`, and panfrost_disk_cache_retrieve()
 * reads them back in the same order.
 */
struct panfrost_shader_binary {
/* Collected information about the compiled shader. Written to the disk
 * cache as a raw byte copy of the structure.
 */
struct pan_shader_info info;
/* The binary itself. The dynarray is initialized by whoever fills this
 * structure in (the compiler or the disk cache retrieve path); the consumer
 * releases it with util_dynarray_fini() once the contents are uploaded.
 */
struct util_dynarray binary;
};
/* Store a freshly compiled shader binary in the disk cache, keyed on the
 * SHA1 of the uncompiled shader's serialized NIR plus the variant key.
 * Does nothing if cache is NULL.
 */
void
panfrost_disk_cache_store(struct disk_cache *cache,
const struct panfrost_uncompiled_shader *uncompiled,
const struct panfrost_shader_key *key,
const struct panfrost_shader_binary *binary);
/* Look up a compiled shader binary in the disk cache using the same key as
 * panfrost_disk_cache_store(). Returns true and fills *binary on a hit;
 * returns false on a miss or if cache is NULL.
 */
bool
panfrost_disk_cache_retrieve(struct disk_cache *cache,
const struct panfrost_uncompiled_shader *uncompiled,
const struct panfrost_shader_key *key,
struct panfrost_shader_binary *binary);
/* Create the on-disk shader cache for a screen and store it in
 * screen->disk_cache.
 */
void
panfrost_disk_cache_init(struct panfrost_screen *screen);
/** (Vertex buffer index, divisor) tuple that will become an Attribute Buffer
* Descriptor at draw-time on Midgard
*/

View File

@@ -0,0 +1,175 @@
/*
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include "compiler/nir/nir.h"
#include "util/blob.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "pan_context.h"
/* Compile-time switch: when set to true, every store and retrieve prints the
 * formatted cache key (and, for retrieves, whether it was found) to stderr.
 */
static bool debug = false;
/* Compiler debug flag words, defined outside this file. They are folded into
 * the driver flags passed to disk_cache_create() by
 * panfrost_disk_cache_init().
 */
extern int midgard_debug;
extern int bifrost_debug;
/**
 * Derive the disk cache key for one shader variant.
 *
 * The key is computed over the SHA1 of the serialized NIR immediately
 * followed by the raw bytes of the variant key.
 *
 * NOTE(review): the variant key is hashed byte-for-byte, padding included, so
 * this presumably relies on callers zero-initializing their keys -- confirm.
 *
 * @param cache       Disk cache used to compute the key.
 * @param uncompiled  Shader CSO providing nir_sha1.
 * @param shader_key  Variant key appended after the NIR SHA1.
 * @param cache_key   Output: the resulting cache key.
 */
static void
panfrost_disk_cache_compute_key(struct disk_cache *cache,
                                const struct panfrost_uncompiled_shader *uncompiled,
                                const struct panfrost_shader_key *shader_key,
                                cache_key cache_key)
{
   const size_t sha1_size = sizeof(uncompiled->nir_sha1);
   const size_t key_size = sizeof(*shader_key);

   uint8_t data[sizeof(uncompiled->nir_sha1) + sizeof(*shader_key)];
   uint8_t *cursor = data;

   /* NIR hash first... */
   memcpy(cursor, uncompiled->nir_sha1, sha1_size);
   cursor += sha1_size;

   /* ...then the variant key right behind it */
   memcpy(cursor, shader_key, key_size);

   disk_cache_compute_key(cache, data, sizeof(data), cache_key);
}
/**
 * Write a compiled shader variant to the disk cache.
 *
 * Intended for newly compiled shaders only: nothing here checks whether the
 * same entry has already been stored.
 *
 * @param cache       Disk cache to write to. NULL makes this a no-op.
 * @param uncompiled  Shader CSO the variant was compiled from.
 * @param key         Variant key the shader was compiled with.
 * @param binary      Compiled program binary and shader info to serialize.
 */
void
panfrost_disk_cache_store(struct disk_cache *cache,
                          const struct panfrost_uncompiled_shader *uncompiled,
                          const struct panfrost_shader_key *key,
                          const struct panfrost_shader_binary *binary)
{
#ifdef ENABLE_SHADER_CACHE
   if (cache == NULL)
      return;

   cache_key entry_key;
   panfrost_disk_cache_compute_key(cache, uncompiled, key, entry_key);

   if (debug) {
      char hex[41];

      _mesa_sha1_format(hex, entry_key);
      fprintf(stderr, "[mesa disk cache] storing %s\n", hex);
   }

   const struct util_dynarray *program = &binary->binary;

   /* Cache entry layout:
    *
    *    uint32   size of the program binary in bytes
    *    bytes    the program binary
    *    bytes    the shader info structure, copied verbatim
    */
   struct blob payload;
   blob_init(&payload);

   blob_write_uint32(&payload, program->size);
   blob_write_bytes(&payload, program->data, program->size);
   blob_write_bytes(&payload, &binary->info, sizeof(binary->info));

   disk_cache_put(cache, entry_key, payload.data, payload.size, NULL);
   blob_finish(&payload);
#endif
}
/**
 * Search for a compiled shader in the disk cache.
 *
 * @param cache       Disk cache to search. NULL is treated as a miss.
 * @param uncompiled  Shader CSO whose NIR SHA1 contributes to the cache key.
 * @param key         Variant key, hashed together with the NIR SHA1.
 * @param binary      Output. On a hit, binary->binary is initialized (with a
 *                    NULL memory context) and filled with the program binary,
 *                    and binary->info receives the shader info. The caller
 *                    then owns binary->binary and must util_dynarray_fini()
 *                    it. On a miss the dynarray holds no allocation from this
 *                    function.
 *
 * @return true on a cache hit; false on a miss, on a truncated cache entry,
 *         or when the shader cache is compiled out.
 */
bool
panfrost_disk_cache_retrieve(struct disk_cache *cache,
                             const struct panfrost_uncompiled_shader *uncompiled,
                             const struct panfrost_shader_key *key,
                             struct panfrost_shader_binary *binary)
{
#ifdef ENABLE_SHADER_CACHE
   if (!cache)
      return false;

   cache_key cache_key;
   panfrost_disk_cache_compute_key(cache, uncompiled, key, cache_key);

   if (debug) {
      char sha1[41];
      _mesa_sha1_format(sha1, cache_key);
      fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1);
   }

   size_t size;
   void *buffer = disk_cache_get(cache, cache_key, &size);

   if (debug)
      fprintf(stderr, "%s\n", buffer ? "found" : "missing");

   if (!buffer)
      return false;

   struct blob_reader blob;
   blob_reader_init(&blob, buffer, size);

   util_dynarray_init(&binary->binary, NULL);

   /* Mirror the layout written by panfrost_disk_cache_store(): binary size,
    * binary contents, then the raw shader info.
    */
   uint32_t binary_size = blob_read_uint32(&blob);
   void *ptr = util_dynarray_resize_bytes(&binary->binary, binary_size, 1);

   blob_copy_bytes(&blob, ptr, binary_size);
   blob_copy_bytes(&blob, &binary->info, sizeof(binary->info));

   /* Reading past the end of the entry means it is truncated or otherwise
    * does not match the expected layout.
    */
   bool valid = !blob.overrun;

   /* disk_cache_get() returns a malloc'd copy of the entry that the caller
    * must release. Everything needed has been copied out by now.
    */
   free(buffer);

   if (!valid) {
      /* Treat a malformed entry as a miss so the shader is recompiled. */
      util_dynarray_fini(&binary->binary);
      return false;
   }

   return true;
#else
   return false;
#endif
}
/**
 * Set up the on-disk shader cache for a screen.
 *
 * The cache is created from the renderer name reported by the screen, the
 * build ID of the binary containing this function (formatted as a SHA1
 * string), and a flag word made of the device debug flags in the low bits
 * and the compiler debug flags shifted into the upper 32 bits. The result is
 * stored in screen->disk_cache.
 *
 * @param screen  Screen to create the cache for.
 */
void
panfrost_disk_cache_init(struct panfrost_screen *screen)
{
#ifdef ENABLE_SHADER_CACHE
   /* Locate the build ID note of the binary this function lives in */
   const struct build_id_note *build_note =
      build_id_find_nhdr_for_addr(panfrost_disk_cache_init);
   assert(build_note && build_id_length(build_note) == 20); /* sha1 */

   const uint8_t *build_sha1 = build_id_data(build_note);
   assert(build_sha1);

   char driver_id[41];
   _mesa_sha1_format(driver_id, build_sha1);

   /* Consider any flags affecting the compile when caching */
   uint64_t compile_flags = ((uint64_t) (midgard_debug | bifrost_debug) << 32);
   compile_flags |= screen->dev.debug;

   const char *gpu_name = screen->base.get_name(&screen->base);

   screen->disk_cache = disk_cache_create(gpu_name, driver_id, compile_flags);
#endif
}

View File

@@ -753,6 +753,8 @@ panfrost_destroy_screen(struct pipe_screen *pscreen)
if (dev->ro)
dev->ro->destroy(dev->ro);
panfrost_close_device(dev);
disk_cache_destroy(screen->disk_cache);
ralloc_free(pscreen);
}
@@ -853,6 +855,12 @@ panfrost_screen_get_compiler_options(struct pipe_screen *pscreen,
return pan_screen(pscreen)->vtbl.get_compiler_options();
}
static struct disk_cache *
panfrost_get_disk_shader_cache(struct pipe_screen *pscreen)
{
return pan_screen(pscreen)->disk_cache;
}
struct pipe_screen *
panfrost_create_screen(int fd, struct renderonly *ro)
{
@@ -896,12 +904,16 @@ panfrost_create_screen(int fd, struct renderonly *ro)
panfrost_is_dmabuf_modifier_supported;
screen->base.context_create = panfrost_create_context;
screen->base.get_compiler_options = panfrost_screen_get_compiler_options;
screen->base.get_disk_shader_cache = panfrost_get_disk_shader_cache;
screen->base.fence_reference = panfrost_fence_reference;
screen->base.fence_finish = panfrost_fence_finish;
screen->base.set_damage_region = panfrost_resource_set_damage_region;
panfrost_resource_screen_init(&screen->base);
pan_blend_shaders_init(dev);
panfrost_disk_cache_init(screen);
panfrost_pool_init(&screen->indirect_draw.bin_pool, NULL, dev,
PAN_BO_EXECUTE, 65536, "Indirect draw shaders",
false, true);

View File

@@ -37,6 +37,7 @@
#include "util/bitset.h"
#include "util/set.h"
#include "util/log.h"
#include "util/disk_cache.h"
#include "pan_device.h"
#include "pan_mempool.h"
@@ -107,6 +108,7 @@ struct panfrost_screen {
} indirect_draw;
struct panfrost_vtable vtbl;
struct disk_cache *disk_cache;
};
static inline struct panfrost_screen *

View File

@@ -37,12 +37,26 @@
#include "nir_serialize.h"
static struct panfrost_uncompiled_shader *
panfrost_alloc_shader(void)
panfrost_alloc_shader(const nir_shader *nir)
{
struct panfrost_uncompiled_shader *so = CALLOC_STRUCT(panfrost_uncompiled_shader);
struct panfrost_uncompiled_shader *so =
rzalloc(NULL, struct panfrost_uncompiled_shader);
simple_mtx_init(&so->lock, mtx_plain);
util_dynarray_init(&so->variants, NULL);
util_dynarray_init(&so->variants, so);
so->nir = nir;
/* Serialize the NIR to a binary blob that we can hash for the disk
* cache. Drop unnecessary information (like variable names) so the
* serialized NIR is smaller, and also to let us detect more isomorphic
* shaders when hashing, increasing cache hits.
*/
struct blob blob;
blob_init(&blob);
nir_serialize(&blob, nir, true);
_mesa_sha1_compute(blob.data, blob.size, so->nir_sha1);
blob_finish(&blob);
return so;
}
@@ -54,17 +68,15 @@ panfrost_alloc_variant(struct panfrost_uncompiled_shader *so)
}
static void
panfrost_shader_compile(struct pipe_screen *pscreen,
struct panfrost_pool *shader_pool,
struct panfrost_pool *desc_pool,
panfrost_shader_compile(struct panfrost_screen *screen,
const nir_shader *ir,
struct util_debug_callback *dbg,
struct panfrost_compiled_shader *state,
struct panfrost_shader_key *key,
unsigned req_local_mem,
unsigned fixed_varying_mask)
unsigned fixed_varying_mask,
struct panfrost_shader_binary *out)
{
struct panfrost_screen *screen = pan_screen(pscreen);
struct panfrost_device *dev = pan_device(pscreen);
struct panfrost_device *dev = pan_device(&screen->base);
nir_shader *s = nir_shader_clone(NULL, ir);
@@ -76,27 +88,27 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
/* Lower this early so the backends don't have to worry about it */
if (s->info.stage == MESA_SHADER_FRAGMENT) {
inputs.fixed_varying_mask = state->key.fs.fixed_varying_mask;
inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
if (s->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) {
NIR_PASS_V(s, nir_lower_fragcolor,
state->key.fs.nr_cbufs_for_fragcolor);
key->fs.nr_cbufs_for_fragcolor);
}
if (state->key.fs.sprite_coord_enable) {
if (key->fs.sprite_coord_enable) {
NIR_PASS_V(s, nir_lower_texcoord_replace,
state->key.fs.sprite_coord_enable,
key->fs.sprite_coord_enable,
true /* point coord is sysval */,
false /* Y-invert */);
}
if (state->key.fs.clip_plane_enable) {
if (key->fs.clip_plane_enable) {
NIR_PASS_V(s, nir_lower_clip_fs,
state->key.fs.clip_plane_enable,
key->fs.clip_plane_enable,
false);
}
memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));
memcpy(inputs.rt_formats, key->fs.rt_formats, sizeof(inputs.rt_formats));
} else if (s->info.stage == MESA_SHADER_VERTEX) {
inputs.fixed_varying_mask = fixed_varying_mask;
@@ -104,41 +116,67 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
inputs.no_idvs = s->info.has_transform_feedback_varyings;
}
struct util_dynarray binary;
util_dynarray_init(&out->binary, NULL);
screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);
util_dynarray_init(&binary, NULL);
screen->vtbl.compile_shader(s, &inputs, &binary, &state->info);
assert(req_local_mem >= out->info.wls_size);
out->info.wls_size = req_local_mem;
assert(req_local_mem >= state->info.wls_size);
state->info.wls_size = req_local_mem;
/* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
* a NULL context
*/
ralloc_free(s);
}
if (binary.size) {
state->bin = panfrost_pool_take_ref(shader_pool,
pan_pool_upload_aligned(&shader_pool->base,
binary.data, binary.size, 128));
static void
panfrost_shader_get(struct pipe_screen *pscreen,
struct panfrost_pool *shader_pool,
struct panfrost_pool *desc_pool,
struct panfrost_uncompiled_shader *uncompiled,
struct util_debug_callback *dbg,
struct panfrost_compiled_shader *state,
unsigned req_local_mem)
{
struct panfrost_screen *screen = pan_screen(pscreen);
struct panfrost_device *dev = pan_device(pscreen);
struct panfrost_shader_binary res = { 0 };
/* Try to retrieve the variant from the disk cache. If that fails,
* compile a new variant and store in the disk cache for later reuse.
*/
if (!panfrost_disk_cache_retrieve(screen->disk_cache, uncompiled, &state->key, &res)) {
panfrost_shader_compile(screen, uncompiled->nir, dbg, &state->key,
req_local_mem,
uncompiled->fixed_varying_mask, &res);
panfrost_disk_cache_store(screen->disk_cache, uncompiled, &state->key, &res);
}
state->info = res.info;
if (res.binary.size) {
state->bin = panfrost_pool_take_ref(shader_pool,
pan_pool_upload_aligned(&shader_pool->base,
res.binary.data, res.binary.size, 128));
}
util_dynarray_fini(&res.binary);
/* Don't upload RSD for fragment shaders since they need draw-time
* merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler
* shader program descriptors on Valhall, which can be preuploaded even
* for fragment shaders. */
bool upload = !(s->info.stage == MESA_SHADER_FRAGMENT && dev->arch <= 7);
bool upload = !(uncompiled->nir->info.stage == MESA_SHADER_FRAGMENT && dev->arch <= 7);
screen->vtbl.prepare_shader(state, desc_pool, upload);
panfrost_analyze_sysvals(state);
util_dynarray_fini(&binary);
/* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
* a NULL context */
ralloc_free(s);
}
static void
panfrost_build_key(struct panfrost_context *ctx,
struct panfrost_shader_key *key,
nir_shader *nir)
const nir_shader *nir)
{
/* We don't currently have vertex shader variants */
if (nir->info.stage != MESA_SHADER_FRAGMENT)
@@ -237,10 +275,8 @@ panfrost_new_variant_locked(
.stream_output = uncompiled->stream_output,
};
panfrost_shader_compile(ctx->base.screen,
&ctx->shaders, &ctx->descs, uncompiled->nir,
&ctx->base.debug, prog, 0,
uncompiled->fixed_varying_mask);
panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs,
uncompiled, &ctx->base.debug, prog, 0);
/* Fixup the stream out information */
prog->so_mask =
@@ -333,14 +369,19 @@ panfrost_create_shader_state(
struct pipe_context *pctx,
const struct pipe_shader_state *cso)
{
struct panfrost_uncompiled_shader *so = panfrost_alloc_shader();
nir_shader *nir = (cso->type == PIPE_SHADER_IR_TGSI) ?
tgsi_to_nir(cso->tokens, pctx->screen, false) :
cso->ir.nir;
struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(nir);
/* The driver gets ownership of the nir_shader for graphics. The NIR is
* ralloc'd. Free the NIR when we free the uncompiled shader.
*/
ralloc_steal(so, nir);
so->stream_output = cso->stream_output;
if (cso->type == PIPE_SHADER_IR_TGSI)
so->nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
else
so->nir = cso->ir.nir;
so->nir = nir;
/* Fix linkage early */
if (so->nir->info.stage == MESA_SHADER_VERTEX) {
@@ -353,7 +394,6 @@ panfrost_create_shader_state(
* feedback program. This is a special shader variant.
*/
struct panfrost_context *ctx = pan_context(pctx);
struct util_debug_callback *dbg = &ctx->base.debug;
if (so->nir->xfb_info) {
nir_shader *xfb = nir_shader_clone(NULL, so->nir);
@@ -361,14 +401,15 @@ panfrost_create_shader_state(
xfb->info.internal = true;
so->xfb = calloc(1, sizeof(struct panfrost_compiled_shader));
panfrost_shader_compile(pctx->screen, &ctx->shaders,
&ctx->descs, xfb, dbg, so->xfb, 0,
so->fixed_varying_mask);
so->xfb->key.vs_is_xfb = true;
panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs,
so, &ctx->base.debug, so->xfb, 0);
/* Since transform feedback is handled via the transform
* feedback program, the original program no longer uses XFB
*/
so->nir->info.has_transform_feedback_varyings = false;
nir->info.has_transform_feedback_varyings = false;
}
/* Compile the program. We don't use vertex shader keys, so there will
@@ -401,14 +442,10 @@ panfrost_create_shader_state(
}
static void
panfrost_delete_shader_state(
struct pipe_context *pctx,
void *so)
panfrost_delete_shader_state(struct pipe_context *pctx, void *so)
{
struct panfrost_uncompiled_shader *cso = (struct panfrost_uncompiled_shader *) so;
ralloc_free(cso->nir);
util_dynarray_foreach(&cso->variants, struct panfrost_compiled_shader, so) {
panfrost_bo_unreference(so->bin.bo);
panfrost_bo_unreference(so->state.bo);
@@ -424,8 +461,7 @@ panfrost_delete_shader_state(
simple_mtx_destroy(&cso->lock);
util_dynarray_fini(&cso->variants);
free(so);
ralloc_free(so);
}
/*
@@ -438,15 +474,19 @@ panfrost_create_compute_state(
const struct pipe_compute_state *cso)
{
struct panfrost_context *ctx = pan_context(pctx);
struct panfrost_uncompiled_shader *so = panfrost_alloc_shader();
struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(cso->prog);
struct panfrost_compiled_shader *v = panfrost_alloc_variant(so);
memset(v, 0, sizeof *v);
assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
panfrost_shader_compile(pctx->screen, &ctx->shaders, &ctx->descs,
cso->prog, &ctx->base.debug, v,
cso->req_local_mem, 0);
panfrost_shader_get(pctx->screen, &ctx->shaders, &ctx->descs,
so, &ctx->base.debug, v, cso->req_local_mem);
/* The NIR becomes invalid after this. For compute kernels, we never
* need to access it again. Don't keep a dangling pointer around.
*/
so->nir = NULL;
return so;
}
@@ -463,16 +503,6 @@ panfrost_bind_compute_state(struct pipe_context *pipe, void *cso)
uncompiled ? util_dynarray_begin(&uncompiled->variants) : NULL;
}
static void
panfrost_delete_compute_state(struct pipe_context *pipe, void *cso)
{
struct panfrost_uncompiled_shader *so =
(struct panfrost_uncompiled_shader *)cso;
util_dynarray_fini(&so->variants);
free(cso);
}
void
panfrost_shader_context_init(struct pipe_context *pctx)
{
@@ -486,5 +516,5 @@ panfrost_shader_context_init(struct pipe_context *pctx)
pctx->create_compute_state = panfrost_create_compute_state;
pctx->bind_compute_state = panfrost_bind_compute_state;
pctx->delete_compute_state = panfrost_delete_compute_state;
pctx->delete_compute_state = panfrost_delete_shader_state;
}