From 3df9d8ed807a6693d5fc8cbda4faec28af081ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 17 Oct 2021 17:16:09 -0400 Subject: [PATCH] gallium/u_threaded: implement pipelined partial buffer uploads using CPU storage This removes resource_copy_region for BufferSubData. Drivers have to opt in to use this. See the comment in the header file. Acked-by: Pierre-Eric Pelloux-Prayer Reviewed-By: Mike Blumenkrantz Part-of: --- src/gallium/auxiliary/driver_noop/noop_pipe.c | 2 +- .../auxiliary/util/u_threaded_context.c | 108 ++++++++++++++++-- .../auxiliary/util/u_threaded_context.h | 31 ++++- src/gallium/drivers/crocus/crocus_resource.c | 2 +- .../drivers/freedreno/freedreno_resource.c | 2 +- src/gallium/drivers/iris/iris_resource.c | 2 +- src/gallium/drivers/r600/r600_buffer_common.c | 2 +- src/gallium/drivers/radeonsi/si_buffer.c | 2 +- src/gallium/drivers/zink/zink_resource.c | 2 +- 9 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/gallium/auxiliary/driver_noop/noop_pipe.c b/src/gallium/auxiliary/driver_noop/noop_pipe.c index 73d35d003cf..19421dec596 100644 --- a/src/gallium/auxiliary/driver_noop/noop_pipe.c +++ b/src/gallium/auxiliary/driver_noop/noop_pipe.c @@ -120,7 +120,7 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen, FREE(nresource); return NULL; } - threaded_resource_init(&nresource->b.b); + threaded_resource_init(&nresource->b.b, false, 0); return &nresource->b.b; } diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 8eb7f86f161..dd45bbf58e3 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -71,6 +71,12 @@ typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call, uint64_t * static const tc_execute execute_func[TC_NUM_CALLS]; +static void +tc_buffer_subdata(struct pipe_context *_pipe, + struct pipe_resource *resource, + unsigned usage, 
unsigned offset, + unsigned size, const void *data); + static void tc_batch_check(UNUSED struct tc_batch *batch) { @@ -669,18 +675,32 @@ tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf, return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage); } +/** + * allow_cpu_storage should be false for user memory and imported buffers. + */ void -threaded_resource_init(struct pipe_resource *res) +threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage, + unsigned map_buffer_alignment) { struct threaded_resource *tres = threaded_resource(res); tres->latest = &tres->b; + tres->cpu_storage = NULL; util_range_init(&tres->valid_buffer_range); tres->is_shared = false; tres->is_user_ptr = false; tres->buffer_id_unique = 0; tres->pending_staging_uploads = 0; util_range_init(&tres->pending_staging_uploads_range); + + if (allow_cpu_storage && + !(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | + PIPE_RESOURCE_FLAG_SPARSE | + PIPE_RESOURCE_FLAG_ENCRYPTED)) && + /* We need buffer invalidation and buffer busyness tracking for the CPU + * storage, which aren't supported with pipe_vertex_state. 
*/ + !(res->bind & PIPE_BIND_VERTEX_STATE)) + tres->cpu_storage = align_malloc(res->width0, map_buffer_alignment); } void @@ -692,6 +712,7 @@ threaded_resource_deinit(struct pipe_resource *res) pipe_resource_reference(&tres->latest, NULL); util_range_destroy(&tres->valid_buffer_range); util_range_destroy(&tres->pending_staging_uploads_range); + align_free(tres->cpu_storage); } struct pipe_context * @@ -907,10 +928,12 @@ tc_get_query_result_resource(struct pipe_context *_pipe, struct pipe_resource *resource, unsigned offset) { struct threaded_context *tc = threaded_context(_pipe); + + tc_buffer_disable_cpu_storage(resource); + struct tc_query_result_resource *p = tc_add_call(tc, TC_CALL_get_query_result_resource, tc_query_result_resource); - p->query = query; p->wait = wait; p->result_type = result_type; @@ -1499,6 +1522,7 @@ tc_set_shader_images(struct pipe_context *_pipe, if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) { struct threaded_resource *tres = threaded_resource(resource); + tc_buffer_disable_cpu_storage(resource); util_range_add(&tres->b, &tres->valid_buffer_range, images[i].u.buf.offset, images[i].u.buf.offset + images[i].u.buf.size); @@ -1591,6 +1615,7 @@ tc_set_shader_buffers(struct pipe_context *_pipe, tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b); if (writable_bitmask & BITFIELD_BIT(i)) { + tc_buffer_disable_cpu_storage(src->buffer); util_range_add(&tres->b, &tres->valid_buffer_range, src->buffer_offset, src->buffer_offset + src->buffer_size); @@ -1737,6 +1762,7 @@ tc_set_stream_output_targets(struct pipe_context *_pipe, p->targets[i] = NULL; pipe_so_target_reference(&p->targets[i], tgs[i]); if (tgs[i]) { + tc_buffer_disable_cpu_storage(tgs[i]->buffer); tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer); } else { tc_unbind_buffer(&tc->streamout_buffers[i]); @@ -1906,6 +1932,9 @@ tc_create_image_handle(struct pipe_context *_pipe, struct threaded_context *tc = threaded_context(_pipe); struct pipe_context *pipe = 
tc->pipe; + if (image->resource->target == PIPE_BUFFER) + tc_buffer_disable_cpu_storage(image->resource); + tc_sync(tc); return pipe->create_image_handle(pipe, image); } @@ -2142,8 +2171,35 @@ tc_buffer_map(struct pipe_context *_pipe, struct threaded_resource *tres = threaded_resource(resource); struct pipe_context *pipe = tc->pipe; + /* PIPE_MAP_THREAD_SAFE is for glthread, which shouldn't use the CPU storage and + * this shouldn't normally be necessary because glthread only uses large buffers. + */ + if (usage & PIPE_MAP_THREAD_SAFE) + tc_buffer_disable_cpu_storage(resource); + usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width); + /* If the CPU storage is enabled, return it directly. */ + if (tres->cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) { + /* We can't let resource_copy_region disable the CPU storage. */ + assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY)); + + struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers); + ttrans->b.resource = resource; + ttrans->b.level = 0; + ttrans->b.usage = usage; + ttrans->b.box = *box; + ttrans->b.stride = 0; + ttrans->b.layer_stride = 0; + ttrans->b.offset = 0; + ttrans->staging = NULL; + ttrans->valid_buffer_range = &tres->valid_buffer_range; + ttrans->cpu_storage_mapped = true; + *transfer = &ttrans->b; + + return (uint8_t*)tres->cpu_storage + box->x; + } + /* Do a staging transfer within the threaded context. The driver should * only get resource_copy_region. */ @@ -2169,6 +2225,7 @@ tc_buffer_map(struct pipe_context *_pipe, ttrans->b.stride = 0; ttrans->b.layer_stride = 0; ttrans->valid_buffer_range = &tres->valid_buffer_range; + ttrans->cpu_storage_mapped = false; *transfer = &ttrans->b; p_atomic_inc(&tres->pending_staging_uploads); @@ -2203,6 +2260,7 @@ tc_buffer_map(struct pipe_context *_pipe, void *ret = pipe->buffer_map(pipe, tres->latest ? 
tres->latest : resource, level, usage, box, transfer); threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range; + threaded_transfer(*transfer)->cpu_storage_mapped = false; if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) tc_clear_driver_thread(tc); @@ -2285,8 +2343,13 @@ tc_buffer_do_flush_region(struct threaded_context *tc, ttrans->staging, 0, &src_box); } - util_range_add(&tres->b, ttrans->valid_buffer_range, - box->x, box->x + box->width); + /* Don't update the valid range when we're uploading the CPU storage + * because it includes the uninitialized range too. + */ + if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) { + util_range_add(&tres->b, ttrans->valid_buffer_range, + box->x, box->x + box->width); + } } static void @@ -2373,12 +2436,37 @@ tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer) return; } - bool was_staging_transfer = false; - if (transfer->usage & PIPE_MAP_WRITE && !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT)) tc_buffer_do_flush_region(tc, ttrans, &transfer->box); + if (ttrans->cpu_storage_mapped) { + /* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't + * touch the mapped range. That's a problem because GPU stores free the CPU storage. + * If that happens, we just ignore the unmap call and don't upload anything to prevent + * a crash. + * + * Disallow the CPU storage in the driver to work around this. + */ + assert(tres->cpu_storage); + + if (tres->cpu_storage) { + tc_invalidate_buffer(tc, tres); + tc_buffer_subdata(&tc->base, &tres->b, + PIPE_MAP_UNSYNCHRONIZED | + TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE, + 0, tres->b.width0, tres->cpu_storage); + /* This shouldn't have been freed by buffer_subdata. 
*/ + assert(tres->cpu_storage); + } + + tc_drop_resource_reference(ttrans->staging); + slab_free(&tc->pool_transfers, ttrans); + return; + } + + bool was_staging_transfer = false; + if (ttrans->staging) { was_staging_transfer = true; @@ -2483,7 +2571,8 @@ tc_buffer_subdata(struct pipe_context *_pipe, */ if (usage & (PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_DISCARD_WHOLE_RESOURCE) || - size > TC_MAX_SUBDATA_BYTES) { + size > TC_MAX_SUBDATA_BYTES || + tres->cpu_storage) { struct pipe_transfer *transfer; struct pipe_box box; uint8_t *map = NULL; @@ -3560,6 +3649,9 @@ tc_resource_copy_region(struct pipe_context *_pipe, tc_add_call(tc, TC_CALL_resource_copy_region, tc_resource_copy_region); + if (dst->target == PIPE_BUFFER) + tc_buffer_disable_cpu_storage(dst); + tc_set_resource_reference(&p->dst, dst); p->dst_level = dst_level; p->dstx = dstx; @@ -3874,6 +3966,8 @@ tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res, struct tc_clear_buffer *p = tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer); + tc_buffer_disable_cpu_storage(res); + tc_set_resource_reference(&p->res, res); tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res); p->offset = offset; diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h index a961a11dbe5..ec7438c8e2f 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.h +++ b/src/gallium/auxiliary/util/u_threaded_context.h @@ -208,6 +208,8 @@ struct tc_unflushed_batch_token; /* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */ #define TC_DEBUG 0 +/* This is an internal flag not sent to the driver. */ +#define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE (1u << 28) /* These are map flags sent to drivers. */ /* Never infer whether it's safe to use unsychronized mappings: */ #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29) @@ -313,6 +315,13 @@ struct threaded_resource { */ struct pipe_resource *latest; + /* Optional CPU storage of the buffer. 
When we get partial glBufferSubData(implemented by + * copy_buffer) + glDrawElements, we don't want to drain the gfx pipeline before executing + * the copy. For ideal pipelining, we upload to this CPU storage and then reallocate + * the GPU storage completely and reupload everything without copy_buffer. + */ + void *cpu_storage; + /* The buffer range which is initialized (with a write transfer, streamout, * or writable shader resources). The remainder of the buffer is considered * invalid and can be mapped unsynchronized. @@ -360,6 +369,8 @@ struct threaded_transfer { * the base instance. Initially it's set to &b.resource->valid_buffer_range. */ struct util_range *valid_buffer_range; + + bool cpu_storage_mapped; }; struct threaded_query { @@ -503,7 +514,8 @@ struct threaded_context { struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS]; }; -void threaded_resource_init(struct pipe_resource *res); +void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage, + unsigned map_buffer_alignment); void threaded_resource_deinit(struct pipe_resource *res); struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe); void tc_driver_internal_flush_notify(struct threaded_context *tc); @@ -579,4 +591,21 @@ tc_assert_driver_thread(struct threaded_context *tc) #endif } +/** + * This is called before GPU stores to disable the CPU storage because + * the CPU storage doesn't mirror the GPU storage. + * + * Drivers should also call it before exporting a DMABUF of a buffer. 
+ */ +static inline void +tc_buffer_disable_cpu_storage(struct pipe_resource *buf) +{ + struct threaded_resource *tres = threaded_resource(buf); + + if (tres->cpu_storage) { + align_free(tres->cpu_storage); + tres->cpu_storage = NULL; + } +} + #endif diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c index 51dcc921cfd..11309b8bef7 100644 --- a/src/gallium/drivers/crocus/crocus_resource.c +++ b/src/gallium/drivers/crocus/crocus_resource.c @@ -356,7 +356,7 @@ crocus_alloc_resource(struct pipe_screen *pscreen, res->base.b.screen = pscreen; res->orig_screen = crocus_pscreen_ref(pscreen); pipe_reference_init(&res->base.b.reference, 1); - threaded_resource_init(&res->base.b); + threaded_resource_init(&res->base.b, false, 0); if (templ->target == PIPE_BUFFER) util_range_init(&res->valid_buffer_range); diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index add830d9c12..f8c8eb9d0c9 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -1109,7 +1109,7 @@ alloc_resource_struct(struct pipe_screen *pscreen, pipe_reference_init(&rsc->track->reference, 1); - threaded_resource_init(prsc); + threaded_resource_init(prsc, false, 0); if (tmpl->target == PIPE_BUFFER) rsc->b.buffer_id_unique = util_idalloc_mt_alloc(&screen->buffer_ids); diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c index 8f3662f6e6f..12857416ef8 100644 --- a/src/gallium/drivers/iris/iris_resource.c +++ b/src/gallium/drivers/iris/iris_resource.c @@ -479,7 +479,7 @@ iris_alloc_resource(struct pipe_screen *pscreen, res->base.b.screen = pscreen; res->orig_screen = iris_pscreen_ref(pscreen); pipe_reference_init(&res->base.b.reference, 1); - threaded_resource_init(&res->base.b, false, 0); res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE; 
res->aux.sampler_usages = 1 << ISL_AUX_USAGE_NONE; diff --git a/src/gallium/drivers/r600/r600_buffer_common.c b/src/gallium/drivers/r600/r600_buffer_common.c index ba645298f81..2035456e4f6 100644 --- a/src/gallium/drivers/r600/r600_buffer_common.c +++ b/src/gallium/drivers/r600/r600_buffer_common.c @@ -584,7 +584,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen, pipe_reference_init(&rbuffer->b.b.reference, 1); rbuffer->b.b.screen = screen; - threaded_resource_init(&rbuffer->b.b); + threaded_resource_init(&rbuffer->b.b, false, 0); rbuffer->buf = NULL; rbuffer->bind_history = 0; diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 1c45c534488..a16c50a94f6 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -567,7 +567,7 @@ static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen, pipe_reference_init(&buf->b.b.reference, 1); buf->b.b.screen = screen; - threaded_resource_init(&buf->b.b); + threaded_resource_init(&buf->b.b, false, 0); buf->buf = NULL; buf->bind_history = 0; diff --git a/src/gallium/drivers/zink/zink_resource.c b/src/gallium/drivers/zink/zink_resource.c index b403bb88864..64516a2900d 100644 --- a/src/gallium/drivers/zink/zink_resource.c +++ b/src/gallium/drivers/zink/zink_resource.c @@ -760,7 +760,7 @@ resource_create(struct pipe_screen *pscreen, res->base.b = *templ; - threaded_resource_init(&res->base.b); + threaded_resource_init(&res->base.b, false, 0); pipe_reference_init(&res->base.b.reference, 1); res->base.b.screen = pscreen;