From 0125e8d3341331f948fb503fe57b2fb55f10e84c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 16 Jul 2023 14:10:20 -0400
Subject: [PATCH] radeonsi: merge si_upload_*_descriptors into
 si_emit_*_shader_pointers

This removes calling si_upload_graphics_shader_descriptors from si_draw
by moving the uploading into si_emit_graphics_shader_pointers. Similar
for compute.

si_upload_shader_descriptors used to set sctx->shader_pointers_dirty to
pass the mask to the emit function. Now, shader_pointers_dirty is both
set and consumed in si_emit_graphics_shader_pointers and
si_emit_compute_shader_pointers, so the mask is passed via a local
variable.

All places that set descriptors_dirty must now also dirty the
gfx_shader_pointers state for the descriptors to be uploaded.

All places that set bindless_descriptors_dirty must do the same and
also make the cache_flush state dirty because
si_emit_graphics_shader_pointers can now set cache flush flags
(through si_upload_bindless_descriptors).

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/si_compute.c     |   1 -
 src/gallium/drivers/radeonsi/si_descriptors.c | 131 +++++++++++-------
 src/gallium/drivers/radeonsi/si_state.h       |   2 -
 .../drivers/radeonsi/si_state_draw.cpp        |   8 +-
 4 files changed, 83 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index ce60c1c4b2a..b7502381b29 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -1008,7 +1008,6 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
                              info->variable_shared_mem))
       return;
 
-   si_upload_compute_shader_descriptors(sctx);
    si_emit_compute_shader_pointers(sctx);
 
    if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a591ac55dd1..171fd127e23 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -624,6 +624,8 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader,
    samplers->needs_color_decompress_mask &= ~unbound_mask;
 
    sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+   if (shader != PIPE_SHADER_COMPUTE)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 }
 
 static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader)
@@ -739,6 +741,8 @@ static void si_disable_shader_image(struct si_context *ctx, unsigned shader, uns
       images->enabled_mask &= ~(1u << slot);
       images->display_dcc_store_mask &= ~(1u << slot);
       ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+      if (shader != PIPE_SHADER_COMPUTE)
+         si_mark_atom_dirty(ctx, &ctx->atoms.s.gfx_shader_pointers);
    }
 }
 
@@ -887,6 +891,8 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne
 
    images->enabled_mask |= 1u << slot;
    ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+   if (shader != PIPE_SHADER_COMPUTE)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.gfx_shader_pointers);
    /* Since this can flush, it must be done after enabled_mask is updated.
     */
    si_sampler_view_add_buffer(ctx, &res->b.b,
@@ -1033,6 +1039,7 @@ void si_update_ps_colorbuf0_slot(struct si_context *sctx)
    }
 
    sctx->descriptors_dirty |= 1u << SI_DESCS_INTERNAL;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
    sctx->ps_uses_fbfetch = surf != NULL;
    si_update_ps_iter_samples(sctx);
    si_ps_key_update_framebuffer(sctx);
@@ -1080,6 +1087,8 @@ static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_ty
       si_set_sampler_state_desc(sstates[i], sview, tex, desc->list + desc_slot * 16 + 12);
 
       sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+      if (shader != PIPE_SHADER_COMPUTE)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
    }
 }
 
@@ -1259,6 +1268,8 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res
    }
 
    sctx->descriptors_dirty |= 1u << descriptors_idx;
+   if (descriptors_idx < SI_DESCS_FIRST_COMPUTE)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 }
 
 void si_get_inline_uniform_state(union si_shader_key *key, enum pipe_shader_type shader,
@@ -1382,6 +1393,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou
       buffers->enabled_mask &= ~(1llu << slot);
       buffers->writable_mask &= ~(1llu << slot);
       sctx->descriptors_dirty |= 1u << descriptors_idx;
+      if (descriptors_idx < SI_DESCS_FIRST_COMPUTE)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
       return;
    }
 
@@ -1403,6 +1416,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou
 
    buffers->enabled_mask |= 1llu << slot;
    sctx->descriptors_dirty |= 1lu << descriptors_idx;
+   if (descriptors_idx < SI_DESCS_FIRST_COMPUTE)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 
    util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset,
                   sbuffer->buffer_offset + sbuffer->buffer_size);
@@ -1574,6 +1589,7 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource
    }
 
    sctx->descriptors_dirty |= 1u << SI_DESCS_INTERNAL;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 }
 
 /* INTERNAL CONST BUFFERS */
@@ -1670,6 +1686,8 @@ static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_
       if (buffer && (!buf || buffer == buf)) {
         si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
         sctx->descriptors_dirty |= 1u << descriptors_idx;
+        if (descriptors_idx < SI_DESCS_FIRST_COMPUTE)
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 
         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                   (buffers->writable_mask & (1llu << i) ?
@@ -1680,6 +1698,15 @@ static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_
 
    return !noop;
 }
 
+static void si_mark_bindless_descriptors_dirty(struct si_context *sctx)
+{
+   sctx->bindless_descriptors_dirty = true;
+   /* gfx_shader_pointers uploads bindless descriptors. */
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
+   /* gfx_shader_pointers can set cache flush flags, so we need to dirty this too. */
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+}
+
 /* Update all buffer bindings where the buffer is bound, including
  * all resource descriptors. This is invalidate_buffer without
  * the invalidation.
@@ -1742,6 +1769,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
 
          si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
          sctx->descriptors_dirty |= 1u << SI_DESCS_INTERNAL;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 
          radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                    RADEON_USAGE_WRITE | RADEON_PRIO_SHADER_RW_BUFFER);
@@ -1799,6 +1827,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
             si_set_buf_desc_address(si_resource(buffer), samplers->views[i]->u.buf.offset,
                                     descs->list + desc_slot * 16 + 4);
             sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+            if (shader != PIPE_SHADER_COMPUTE)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                       RADEON_USAGE_READ | RADEON_PRIO_SAMPLER_BUFFER);
@@ -1829,6 +1859,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
             si_set_buf_desc_address(si_resource(buffer), images->views[i].u.buf.offset,
                                     descs->list + desc_slot * 8 + 4);
             sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+            if (shader != PIPE_SHADER_COMPUTE)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
 
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                       RADEON_USAGE_READWRITE |
@@ -1855,7 +1887,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
                                     descs->list + desc_slot * 16 + 4);
 
             (*tex_handle)->desc_dirty = true;
-            sctx->bindless_descriptors_dirty = true;
+            si_mark_bindless_descriptors_dirty(sctx);
 
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                       RADEON_USAGE_READ | RADEON_PRIO_SAMPLER_BUFFER);
@@ -1880,7 +1912,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
                                     descs->list + desc_slot * 16 + 4);
 
             (*img_handle)->desc_dirty = true;
-            sctx->bindless_descriptors_dirty = true;
+            si_mark_bindless_descriptors_dirty(sctx);
 
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
                                       RADEON_USAGE_READWRITE | RADEON_PRIO_SAMPLER_BUFFER);
@@ -1950,7 +1982,6 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
 
    /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
    sctx->flags |= SI_CONTEXT_INV_SCACHE;
-   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 
    sctx->bindless_descriptors_dirty = false;
 }
@@ -1971,7 +2002,7 @@ static void si_update_bindless_texture_descriptor(struct si_context *sctx,
 
    if (memcmp(desc_list, desc->list + desc_slot_offset, sizeof(desc_list))) {
      tex_handle->desc_dirty = true;
-     sctx->bindless_descriptors_dirty = true;
+     si_mark_bindless_descriptors_dirty(sctx);
    }
 }
 
@@ -1994,7 +2025,7 @@ static void si_update_bindless_image_descriptor(struct si_context *sctx,
 
    if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) {
      img_handle->desc_dirty = true;
-     sctx->bindless_descriptors_dirty = true;
+     si_mark_bindless_descriptors_dirty(sctx);
    }
 }
 
@@ -2007,8 +2038,6 @@ static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
    util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
      si_update_bindless_image_descriptor(sctx, *img_handle);
    }
-
-   si_upload_bindless_descriptors(sctx);
 }
 
 /* Update mutable image descriptor fields of all bound textures.
  */
@@ -2155,7 +2184,7 @@ void si_shader_change_notify(struct si_context *sctx)
 #define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \
    unsigned sh_reg_base = (sh_base); \
    if (sh_reg_base) { \
-      unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
+      unsigned mask = shader_pointers_dirty & (pointer_mask); \
       \
       if (sctx->screen->info.has_set_pairs_packets) { \
          u_foreach_bit(i, mask) { \
@@ -2225,8 +2254,27 @@ static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_de
 void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
 {
    uint32_t *sh_base = sctx->shader_pointers.sh_base;
+   unsigned all_gfx_desc_mask = BITFIELD_RANGE(0, SI_DESCS_FIRST_COMPUTE);
+   unsigned descriptors_dirty = sctx->descriptors_dirty & all_gfx_desc_mask;
+   unsigned shader_pointers_dirty = sctx->shader_pointers_dirty | descriptors_dirty;
 
-   if (sctx->shader_pointers_dirty & (1 << SI_DESCS_INTERNAL)) {
+   /* Blits shouldn't set VS shader pointers. */
+   if (sctx->num_vs_blit_sgprs)
+      shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
+
+   /* Upload descriptors. */
+   if (descriptors_dirty) {
+      sctx->descriptors_dirty &= ~descriptors_dirty;
+
+      do {
+         si_upload_descriptors(sctx, &sctx->descriptors[u_bit_scan(&descriptors_dirty)]);
+      } while (descriptors_dirty);
+   }
+
+   si_upload_bindless_descriptors(sctx);
+
+   /* Set shader pointers. */
+   if (shader_pointers_dirty & (1 << SI_DESCS_INTERNAL)) {
       si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_INTERNAL]);
    }
 
@@ -2256,7 +2304,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
    }
    radeon_end();
 
-   sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_INTERNAL, SI_DESCS_FIRST_COMPUTE);
+   sctx->shader_pointers_dirty &= ~all_gfx_desc_mask;
 
    if (sctx->graphics_bindless_pointer_dirty) {
       si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors);
@@ -2266,8 +2314,23 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
 
 void si_emit_compute_shader_pointers(struct si_context *sctx)
 {
+   /* This does not update internal bindings as that is not needed for compute shaders. */
+   unsigned descriptors_dirty = sctx->descriptors_dirty & SI_DESCS_SHADER_MASK(COMPUTE);
+   unsigned shader_pointers_dirty = sctx->shader_pointers_dirty | descriptors_dirty;
+
+   /* Upload descriptors. */
+   if (descriptors_dirty) {
+      sctx->descriptors_dirty &= ~descriptors_dirty;
+
+      do {
+         si_upload_descriptors(sctx, &sctx->descriptors[u_bit_scan(&descriptors_dirty)]);
+      } while (descriptors_dirty);
+   }
+
+   si_upload_bindless_descriptors(sctx);
+
+   /* Set shader pointers. */
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-   struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel;
    unsigned base = R_00B900_COMPUTE_USER_DATA_0;
 
    radeon_begin(cs);
@@ -2286,7 +2349,9 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
    }
 
    /* Set shader buffer descriptors in user SGPRs. */
+   struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel;
    unsigned num_shaderbufs = shader->cs_num_shaderbufs_in_user_sgprs;
+
    if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) {
       struct si_descriptors *desc =
          si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE);
@@ -2542,7 +2607,7 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h
        * wasn't resident.
        */
       if (tex_handle->desc_dirty)
-         sctx->bindless_descriptors_dirty = true;
+         si_mark_bindless_descriptors_dirty(sctx);
 
       /* Add the texture handle to the per-context list. */
       util_dynarray_append(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle);
@@ -2663,7 +2728,7 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han
        * wasn't resident.
        */
       if (img_handle->desc_dirty)
-         sctx->bindless_descriptors_dirty = true;
+         si_mark_bindless_descriptors_dirty(sctx);
 
       /* Add the image handle to the per-context list. */
       util_dynarray_append(&sctx->resident_img_handles, struct si_image_handle *, img_handle);
@@ -2835,41 +2900,6 @@ void si_init_all_descriptors(struct si_context *sctx)
                             0, ~0u, false, true, 16, 32, 0);
 }
 
-static void si_upload_shader_descriptors(struct si_context *sctx, unsigned mask)
-{
-   unsigned dirty = sctx->descriptors_dirty & mask;
-
-   if (dirty) {
-      unsigned iter_mask = dirty;
-
-      do {
-         si_upload_descriptors(sctx, &sctx->descriptors[u_bit_scan(&iter_mask)]);
-      } while (iter_mask);
-
-      sctx->descriptors_dirty &= ~dirty;
-      sctx->shader_pointers_dirty |= dirty;
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
-   }
-
-   si_upload_bindless_descriptors(sctx);
-}
-
-void si_upload_graphics_shader_descriptors(struct si_context *sctx)
-{
-   const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
-   si_upload_shader_descriptors(sctx, mask);
-}
-
-void si_upload_compute_shader_descriptors(struct si_context *sctx)
-{
-   /* This does not update internal bindings as that is not needed for compute shaders
-    * and the input buffer is using the same SGPR's anyway.
-    */
-   const unsigned mask =
-      u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
-   si_upload_shader_descriptors(sctx, mask);
-}
-
 void si_release_all_descriptors(struct si_context *sctx)
 {
    int i;
@@ -3037,8 +3067,11 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, uint6
    /* Upload/dump descriptors if slots are being enabled.
     */
    if (first < desc->first_active_slot ||
-       first + count > desc->first_active_slot + desc->num_active_slots)
+       first + count > desc->first_active_slot + desc->num_active_slots) {
       sctx->descriptors_dirty |= 1u << desc_idx;
+      if (desc_idx < SI_DESCS_FIRST_COMPUTE)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.gfx_shader_pointers);
+   }
 
    desc->first_active_slot = first;
    desc->num_active_slots = count;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index ba23670d7b3..3e2d2076889 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -542,8 +542,6 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource
                         unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
                         unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
-void si_upload_graphics_shader_descriptors(struct si_context *sctx);
-void si_upload_compute_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx);
 void si_compute_resources_add_all_to_bo_list(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index f031f5d1255..623e381a5d4 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2226,11 +2226,6 @@ static void si_draw(struct pipe_context *ctx,
    if (sctx->bo_list_add_all_gfx_resources)
       si_gfx_resources_add_all_to_bo_list(sctx);
 
-   /* Graphics shader descriptors must be uploaded after si_update_shaders because
-    * si_update_shaders binds tess and GS ring buffers.
-    */
-   si_upload_graphics_shader_descriptors(sctx);
-
    /* This is the optimal packet order:
    * Set all states first, so that all SET packets are processed in parallel with previous
    * draw calls. Then flush caches and wait if needed. Then draw and prefetch at the end.
@@ -2412,8 +2407,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    draw.start = 0;
    draw.count = 3;
 
-   /* Don't set per-stage shader pointers for VS. */
-   sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
+   /* Blits don't use vertex buffers. */
    sctx->vertex_buffers_dirty = false;
 
    si_draw
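
The dirty-state contract this patch describes can be pictured with a minimal, self-contained C sketch. This is an editor's illustration only, not Mesa code: struct ctx, bind_something, emit_graphics_shader_pointers, upload_descriptors and DESCS_FIRST_COMPUTE are hypothetical stand-ins for si_context, the various bind functions, si_emit_graphics_shader_pointers, si_upload_descriptors and SI_DESCS_FIRST_COMPUTE. It shows the two halves of the new flow: binding code only sets a bit in descriptors_dirty and dirties the gfx_shader_pointers state, while the emit function derives a local shader_pointers_dirty mask, uploads whatever is dirty, sets the pointers, and clears the bits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DESCS_FIRST_COMPUTE 6  /* sets 0..5 are graphics, the rest compute */

struct ctx {
   uint32_t descriptors_dirty;      /* descriptor sets that need uploading */
   uint32_t shader_pointers_dirty;  /* SH pointers that need re-emitting */
   bool gfx_shader_pointers_dirty;  /* the "state/atom" that triggers the emit path */
};

/* Stub: the real driver copies the descriptor list into GPU memory here. */
static void upload_descriptors(struct ctx *c, unsigned idx)
{
   printf("upload descriptor set %u\n", idx);
}

/* Binding code: set descriptors_dirty and dirty the gfx_shader_pointers state. */
static void bind_something(struct ctx *c, unsigned desc_idx)
{
   c->descriptors_dirty |= 1u << desc_idx;
   if (desc_idx < DESCS_FIRST_COMPUTE)
      c->gfx_shader_pointers_dirty = true;
}

/* Emit path: upload and set pointers in one place, using a local mask. */
static void emit_graphics_shader_pointers(struct ctx *c)
{
   uint32_t all_gfx_mask = (1u << DESCS_FIRST_COMPUTE) - 1;
   uint32_t descriptors_dirty = c->descriptors_dirty & all_gfx_mask;
   uint32_t shader_pointers_dirty = c->shader_pointers_dirty | descriptors_dirty;

   /* Upload descriptors, consuming only the graphics bits. */
   c->descriptors_dirty &= ~descriptors_dirty;
   for (unsigned i = 0; i < DESCS_FIRST_COMPUTE; i++) {
      if (descriptors_dirty & (1u << i))
         upload_descriptors(c, i);
   }

   /* Set the shader pointers for everything that was dirty. */
   for (unsigned i = 0; i < DESCS_FIRST_COMPUTE; i++) {
      if (shader_pointers_dirty & (1u << i))
         printf("set SH pointer for descriptor set %u\n", i);
   }
   c->shader_pointers_dirty &= ~all_gfx_mask;
   c->gfx_shader_pointers_dirty = false;
}

int main(void)
{
   struct ctx c = {0};

   bind_something(&c, 2);            /* e.g. a constant buffer was bound */
   if (c.gfx_shader_pointers_dirty)  /* the draw path only checks the dirty state */
      emit_graphics_shader_pointers(&c);
   return 0;
}

The sketch mirrors the design choice in the patch: because the upload now happens inside the emit function, every caller that dirties a descriptor set must also dirty the gfx_shader_pointers state, otherwise the emit path would never run and the descriptors would never reach the GPU.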