radeonsi: add a fail_if_slow parameter to compute_clear/copy_buffer

and move all failure paths into it.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30173>
Marek Olšák
2024-04-27 08:27:36 -04:00
committed by Marge Bot
parent e42a25aea1
commit 8df427f162
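
A minimal sketch of the pattern this diff implements: the compute path rejects cases it cannot handle at all, and, when the caller passes fail_if_slow, it also declines cases where CP DMA is expected to be faster, so the caller can fall back. The names, types and the 8 KiB cutoff below are illustrative stand-ins (the cutoff echoes the dGPU copy threshold in the diff), not the actual radeonsi helpers.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the two hardware paths. */
static void launch_compute_copy(unsigned size) { printf("compute copy: %u bytes\n", size); }
static void cp_dma_copy(unsigned size)         { printf("CP DMA copy: %u bytes\n", size); }

/* Compute path: returns false instead of doing a slow or unsupported job,
 * mirroring the fail_if_slow contract added by this commit. */
static bool compute_copy_buffer(unsigned offset, unsigned size, bool fail_if_slow)
{
   /* Cases the shader cannot handle at all (dword alignment). */
   if (offset % 4 || size % 4)
      return false;

   /* Cases where CP DMA is expected to be faster; only honoured when the
    * caller asked for the check. The real driver also looks at VRAM
    * placement, GPU generation, render-condition use, etc. */
   if (fail_if_slow && size <= 8192)
      return false;

   launch_compute_copy(size);
   return true;
}

static void copy_buffer(unsigned offset, unsigned size)
{
   /* Prefer compute, but let it decline; fall back to CP DMA. */
   if (compute_copy_buffer(offset, size, /*fail_if_slow=*/true))
      return;
   cp_dma_copy(size);
}

int main(void)
{
   copy_buffer(0, 4096);   /* small copy: compute declines, CP DMA runs */
   copy_buffer(0, 65536);  /* large copy: compute path runs */
   return 0;
}

The same shape appears in si_copy_buffer below, where si_cp_dma_copy_buffer is the fallback taken when si_compute_clear_or_copy_buffer returns false.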

@@ -257,20 +257,44 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
1, &sb, 0x1);
}
static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
unsigned dst_offset, struct pipe_resource *src,
unsigned src_offset, unsigned size,
const uint32_t *clear_value, unsigned clear_value_size,
unsigned flags, enum si_coherency coher)
static bool si_compute_clear_or_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
unsigned dst_offset, struct pipe_resource *src,
unsigned src_offset, unsigned size,
const uint32_t *clear_value, unsigned clear_value_size,
unsigned flags, enum si_coherency coher,
bool fail_if_slow)
{
assert(src_offset % 4 == 0);
assert(dst_offset % 4 == 0);
assert(size % 4 == 0);
bool is_copy = src != NULL;
if (src_offset % 4 || dst_offset % 4 || size % 4 || clear_value_size % 4)
return false;
/* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
* support the render condition.
*/
if (fail_if_slow && !(flags & SI_OP_CS_RENDER_COND_ENABLE) && sctx->screen->info.has_cp_dma &&
!sctx->screen->info.cp_sdma_ge_use_system_memory_scope) {
if (is_copy) {
/* Only use compute for large VRAM copies on dGPUs. */
if (size <= 8192 || !sctx->screen->info.has_dedicated_vram ||
!(si_resource(dst)->domains & RADEON_DOMAIN_VRAM) ||
!(si_resource(src)->domains & RADEON_DOMAIN_VRAM))
return false;
} else {
/* Buffer clear.
*
* CP DMA clears are terribly slow with GTT on GFX6-8, which can be encountered with any
* buffer due to BO evictions, so never use CP DMA clears on GFX6-8. On GFX9+, use CP DMA
* clears if the size is small.
*/
if (sctx->gfx_level >= GFX9 && clear_value_size <= 4 && size <= 4096)
return false;
}
}
assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
assert(!src || src_offset + size <= src->width0);
bool is_copy = src != NULL;
unsigned dwords_per_thread = clear_value_size == 12 ? 3 : 4;
unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);
@@ -301,6 +325,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb,
is_copy ? 0x2 : 0x1);
return true;
}
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
@@ -318,52 +343,28 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
assert(offset % clear_alignment == 0);
assert(size % clear_alignment == 0);
assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
assert(offset < (UINT32_MAX & ~0x3)); /* the limit of pipe_shader_buffer::buffer_size */
assert(align(size, 16) < UINT32_MAX); /* we round up the size to 16 for compute */
uint32_t clamped;
if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped))
clear_value = &clamped;
uint64_t aligned_size = size & ~3ull;
if (aligned_size >= 4) {
uint64_t compute_min_size;
if (sctx->gfx_level <= GFX8) {
/* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
* happen due to BO evictions.
*/
compute_min_size = 0;
} else {
/* Use a small enough size because CP DMA is slower than compute with bigger sizes. */
compute_min_size = 4 * 1024;
}
/* TODO: use compute for 8-bit and 16-bit clear values */
if (method == SI_AUTO_SELECT_CLEAR_METHOD &&
/* CP DMA doesn't support the render condition. */
(flags & SI_OP_CS_RENDER_COND_ENABLE ||
/* CP DMA doesn't support large clear value sizes. */
clear_value_size > 4 ||
/* Use compute if the size is large enough. Always prefer compute on GFX12. */
(clear_value_size == 4 && offset % 4 == 0 &&
(!sctx->screen->info.has_cp_dma ||
sctx->screen->info.cp_sdma_ge_use_system_memory_scope || size > compute_min_size))))
method = SI_COMPUTE_CLEAR_METHOD;
if (method == SI_COMPUTE_CLEAR_METHOD) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
clear_value_size, flags, coher);
} else {
assert(clear_value_size == 4);
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
flags, coher, get_cache_policy(sctx, coher, size));
}
offset += aligned_size;
size -= aligned_size;
if (aligned_size &&
(method == SI_CP_DMA_CLEAR_METHOD ||
!si_compute_clear_or_copy_buffer(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
clear_value_size, flags, coher,
method == SI_AUTO_SELECT_CLEAR_METHOD))) {
assert(clear_value_size == 4);
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value,
flags, coher, get_cache_policy(sctx, coher, size));
}
offset += aligned_size;
size -= aligned_size;
/* Handle non-dword alignment. */
if (size) {
assert(!(flags & SI_OP_CS_RENDER_COND_ENABLE));
@@ -398,22 +399,15 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
enum si_coherency coher = SI_COHERENCY_SHADER;
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
uint64_t compute_min_size = 8 * 1024;
si_improve_sync_flags(sctx, dst, src, &flags);
/* Only use compute for VRAM copies on dGPUs. */
/* TODO: use compute for unaligned big sizes */
if (dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
(!sctx->screen->info.has_cp_dma || sctx->screen->info.cp_sdma_ge_use_system_memory_scope ||
(sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size))) {
si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
flags, coher);
} else {
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
flags, coher, cache_policy);
}
if (si_compute_clear_or_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
flags, coher, true))
return;
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags, coher,
cache_policy);
}
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,