anv: Implement cmd_dispatch_unaligned callback

Rework: (Kevin)
- Calculate the correct number of threads in the GPGPU thread group based
  on the SIMD size.
- Instead of rounding up, use simple division and let the remainder path
  handle groupCount < local_size_x (see the sketch below).
- Drop indirect_unroll_off and fix the bug where is_unaligned_size_x was
  not being used.
Co-authored-by: Kevin Chuang <kaiwenjon23@gmail.com>
Co-authored-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31588>
Authored by Sagar Ghuge on 2024-05-22 08:45:06 -07:00, committed by Marge Bot
parent 0cab02ca9b, commit 9002e52037
4 changed files with 209 additions and 20 deletions


@@ -1569,3 +1569,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer commandBuffer)
    * ANV_PIPE_DATA_CACHE_FLUSH_BIT.
    */
 }
+
+void
+anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
+      (commandBuffer, invocations_x, invocations_y, invocations_z);
+}


@@ -432,3 +432,15 @@ genX(cmd_buffer_flush_push_descriptors)(struct anv_cmd_buffer *cmd_buffer,
 void genX(emit_embedded_sampler)(struct anv_device *device,
                                  struct anv_embedded_sampler *sampler,
                                  struct anv_pipeline_embedded_sampler_binding *binding);
+
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x);
+
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z);


@@ -5911,6 +5911,12 @@ anv_cmd_write_buffer_cp(VkCommandBuffer cmd_buffer,
                         void *data,
                         uint32_t size);
 void
+anv_cmd_dispatch_unaligned(VkCommandBuffer cmd_buffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z);
+
+void
 anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
 VkResult


@@ -292,12 +292,30 @@ anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
 static void
 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
-                             const struct anv_address indirect_addr)
+                             const struct anv_address indirect_addr,
+                             bool is_unaligned_size_x)
 {
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+
+   /* Convert unaligned thread invocations into an aligned thread group
+    * count in the X dimension for unaligned shader dispatches during the
+    * ray tracing phase.
+    */
+   if (is_unaligned_size_x) {
+      const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+      mi_builder_set_mocs(&b, mocs);
+
+      struct anv_compute_pipeline *pipeline =
+         anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+      assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
+      size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
+      size_x = mi_iadd(&b, size_x, mi_imm(1));
+   }
+
    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
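
The mi_builder sequence above (MI_MATH on the command streamer) computes the
aligned group count at dispatch time. As a rough host-side illustration (the
helper name and standalone form are mine, not part of this commit), it is
equivalent to:

  #include <assert.h>
  #include <stdint.h>

  /* CPU-side sketch of the mi_udiv32_imm()/mi_iadd() sequence: turn an
   * unaligned X invocation count read from the indirect buffer into a
   * thread group count. */
  static uint32_t
  unaligned_to_group_count_x(uint32_t size_x, uint32_t local_size_x)
  {
     /* Same precondition the hunk asserts: a power-of-two local size. */
     assert(local_size_x != 0 && (local_size_x & (local_size_x - 1)) == 0);

     /* Simple division covers the aligned part; the unconditional "+ 1"
      * adds one more group for any leftover invocations. */
     return size_x / local_size_x + 1;
  }
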
@@ -415,16 +433,13 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                     const struct anv_compute_pipeline *pipeline,
                     struct anv_address indirect_addr,
                     const struct brw_cs_prog_data *prog_data,
+                    struct intel_cs_dispatch_info dispatch,
                     uint32_t groupCountX, uint32_t groupCountY,
                     uint32_t groupCountZ)
 {
    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const bool predicate = cmd_buffer->state.conditional_render_enabled;
-   const struct intel_device_info *devinfo = pipeline->base.device->info;
-   const struct intel_cs_dispatch_info dispatch =
-      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
 
    uint32_t num_workgroup_data[3];
 
    if (!anv_address_is_null(indirect_addr)) {
       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
@@ -520,13 +535,19 @@ static inline void
 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
                const struct anv_compute_pipeline *pipeline,
                const struct brw_cs_prog_data *prog_data,
+               struct intel_cs_dispatch_info dispatch,
                struct anv_address indirect_addr,
-               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
+               bool is_unaligned_size_x)
 {
    bool is_indirect = !anv_address_is_null(indirect_addr);
 
 #if GFX_VERx10 >= 125
-   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+   /* For unaligned dispatch, we need to tweak the dispatch value with
+    * MI_MATH, so we can't use indirect HW instructions.
+    */
+   if (is_indirect && !is_unaligned_size_x &&
+       cmd_buffer->device->info->has_indirect_unroll) {
       emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                    indirect_addr);
       return;
@@ -534,11 +555,12 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
 #endif
 
    if (is_indirect)
-      compute_load_indirect_params(cmd_buffer, indirect_addr);
+      compute_load_indirect_params(cmd_buffer, indirect_addr,
+                                   is_unaligned_size_x);
 
 #if GFX_VERx10 >= 125
    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
-                       groupCountX, groupCountY, groupCountZ);
+                       dispatch, groupCountX, groupCountY, groupCountZ);
 #else
    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                      groupCountX, groupCountY, groupCountZ);
@@ -558,6 +580,8 @@ void genX(CmdDispatchBase)(
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
@@ -581,32 +605,154 @@ void genX(CmdDispatchBase)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data,
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
                   ANV_NULL_ADDRESS /* no indirect data */,
-                  groupCountX, groupCountY, groupCountZ);
+                  groupCountX, groupCountY, groupCountZ,
+                  false);
 
    trace_intel_end_compute(&cmd_buffer->trace,
                            groupCountX, groupCountY, groupCountZ);
 }
 
-void genX(CmdDispatchIndirect)(
+static void
+emit_unaligned_cs_walker(
    VkCommandBuffer commandBuffer,
-   VkBuffer _buffer,
-   VkDeviceSize offset)
+   uint32_t baseGroupX,
+   uint32_t baseGroupY,
+   uint32_t baseGroupZ,
+   uint32_t groupCountX,
+   uint32_t groupCountY,
+   uint32_t groupCountZ,
+   struct intel_cs_dispatch_info dispatch)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_address addr = anv_address_add(buffer->address, offset);
-   UNUSED struct anv_batch *batch = &cmd_buffer->batch;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
-                                  0, 0, 0, 0, 0, 0, addr);
+                                  baseGroupX, baseGroupY, baseGroupZ,
+                                  groupCountX, groupCountY, groupCountZ,
+                                  ANV_NULL_ADDRESS);
+
+   /* RT shaders always have the Y and Z local sizes set to 1. */
+   assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
+
+   /* RT shaders are always dispatched with group counts Y and Z set to 1. */
+   assert(groupCountY == 1 && groupCountZ == 1);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_measure_snapshot(cmd_buffer,
+                        INTEL_SNAPSHOT_COMPUTE,
+                        "compute-unaligned-cs-walker",
+                        groupCountX * groupCountY * groupCountZ *
+                        prog_data->local_size[0] * prog_data->local_size[1] *
+                        prog_data->local_size[2]);
+   trace_intel_begin_compute(&cmd_buffer->trace);
+
+   assert(!prog_data->uses_num_work_groups);
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VERx10 >= 125
+   emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
+                       dispatch, groupCountX, groupCountY, groupCountZ);
+#endif
+
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ);
+}
+
+/*
+ * Dispatch a compute workload with an unaligned thread invocation count.
+ *
+ * This helper takes an unaligned number of thread invocations, converts it
+ * into an aligned thread group count, and dispatches the compute work items.
+ *
+ * We launch two CS walkers: one for the aligned part and a second walker
+ * with a single group for the remaining thread invocations.
+ *
+ * This function is currently used only for BVH building.
+ */
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+   /* The group count in X can be unaligned for RT dispatches. */
+   uint32_t groupCountX = invocations_x / prog_data->local_size[0];
+   uint32_t groupCountY = invocations_y;
+   uint32_t groupCountZ = invocations_z;
+
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
+
+   /* Launch the first CS walker with the aligned group count in X. */
+   if (groupCountX) {
+      emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
+                               groupCountY, groupCountZ, dispatch);
+   }
+
+   uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
+   if (unaligned_invocations_x) {
+      dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
+                                      dispatch.simd_size);
+
+      /* Make sure the 2nd walker has the same number of invocations per
+       * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
+       * calculated correctly with baseGroup.
+       */
+      assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
+
+      const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
+      if (remainder > 0) {
+         dispatch.right_mask = ~0u >> (32 - remainder);
+      } else {
+         dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
+      }
+
+      /* Launch the second CS walker for the unaligned remainder. */
+      emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
+                               dispatch);
+   }
+}
+
+/*
+ * Dispatch a compute workload with indirect parameters.
+ *
+ * This helper also converts unaligned thread invocations into an aligned
+ * thread group count.
+ */
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x)
+{
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+   UNUSED struct anv_batch *batch = &cmd_buffer->batch;
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
+                                  0, 0, 0, 0, 0, 0, indirect_addr);
+
    anv_measure_snapshot(cmd_buffer,
                         INTEL_SNAPSHOT_COMPUTE,
@@ -619,10 +765,23 @@ void genX(CmdDispatchIndirect)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
+                  0, 0, is_unaligned_size_x);
 
    trace_intel_end_compute_indirect(&cmd_buffer->trace,
-                                    anv_address_utrace(addr));
+                                    anv_address_utrace(indirect_addr));
 }
+
+void genX(CmdDispatchIndirect)(
+   VkCommandBuffer commandBuffer,
+   VkBuffer _buffer,
+   VkDeviceSize offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_address addr = anv_address_add(buffer->address, offset);
+
+   genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
+}
 
 struct anv_address
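
To make the two-walker split concrete, here is a small worked example. The
numbers are assumptions chosen for illustration (local_size_x == 16 and a
SIMD16 dispatch, so the assert above holds); they are not taken from the
commit, and the standalone program is only a host-side model of what
genX(cmd_dispatch_unaligned) programs into the two walkers:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  #define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

  int main(void)
  {
     const uint32_t invocations_x = 1000; /* unaligned X invocation count */
     const uint32_t local_size_x  = 16;   /* assumed CS workgroup width   */
     const uint32_t simd_size     = 16;   /* assumed SIMD16 dispatch      */

     /* Walker 1: aligned part, simple division as in the rework notes. */
     uint32_t group_count_x = invocations_x / local_size_x;       /* 62  */
     printf("walker 1: %u groups -> %u invocations\n",
            group_count_x, group_count_x * local_size_x);         /* 992 */

     /* Walker 2: a single group for the remainder, launched with baseGroupX
      * set to group_count_x so gl_GlobalInvocationID continues at 992. */
     uint32_t rem = invocations_x % local_size_x;                 /* 8   */
     if (rem) {
        uint32_t threads = DIV_ROUND_UP(rem, simd_size);          /* 1   */
        assert(threads * simd_size == local_size_x);
        uint32_t lanes = rem & (simd_size - 1);
        uint32_t right_mask = lanes ? ~0u >> (32 - lanes)
                                    : ~0u >> (32 - simd_size);    /* 0xff */
        printf("walker 2: baseGroupX=%u, 1 group, %u thread(s), "
               "right_mask=0x%x\n", group_count_x, threads, right_mask);
     }
     return 0;
  }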