diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index bbd1e028779..502eaf0dbd3 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -1569,3 +1569,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer commandBuffer)
    * ANV_PIPE_DATA_CACHE_FLUSH_BIT.
    */
 }
+
+void
+anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
+      (commandBuffer, invocations_x, invocations_y, invocations_z);
+}
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 4c055d08c0c..e1bd770d0d5 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -432,3 +432,15 @@ genX(cmd_buffer_flush_push_descriptors)(struct anv_cmd_buffer *cmd_buffer,
 void genX(emit_embedded_sampler)(struct anv_device *device,
                                  struct anv_embedded_sampler *sampler,
                                  struct anv_pipeline_embedded_sampler_binding *binding);
+
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x);
+
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index f762eb36ffe..1eb2c5a1fde 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -5911,6 +5911,12 @@ anv_cmd_write_buffer_cp(VkCommandBuffer cmd_buffer,
                         void *data,
                         uint32_t size);
 void
+anv_cmd_dispatch_unaligned(VkCommandBuffer cmd_buffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z);
+
+void
 anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
 
 VkResult
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index 57906acd227..401b8f4c539 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -292,12 +292,30 @@ anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
 
 static void
 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
-                             const struct anv_address indirect_addr)
+                             const struct anv_address indirect_addr,
+                             bool is_unaligned_size_x)
 {
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+
+   /* Convert unaligned thread invocations into an aligned thread-group count
+    * in X for unaligned shader dispatches during the ray-tracing phase.
+    */
+   if (is_unaligned_size_x) {
+      const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+      mi_builder_set_mocs(&b, mocs);
+
+      struct anv_compute_pipeline *pipeline =
+         anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+      assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
+      size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
+      size_x = mi_iadd(&b, size_x, mi_imm(1));
+   }
+
    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
 
@@ -415,16 +433,13 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                     const struct anv_compute_pipeline *pipeline,
                     struct anv_address indirect_addr,
                     const struct brw_cs_prog_data *prog_data,
+                    struct intel_cs_dispatch_info dispatch,
                     uint32_t groupCountX, uint32_t groupCountY,
                     uint32_t groupCountZ)
 {
    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const bool predicate = cmd_buffer->state.conditional_render_enabled;
 
-   const struct intel_device_info *devinfo = pipeline->base.device->info;
-   const struct intel_cs_dispatch_info dispatch =
-      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
    uint32_t num_workgroup_data[3];
    if (!anv_address_is_null(indirect_addr)) {
       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
@@ -520,13 +535,19 @@ static inline void
 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
                const struct anv_compute_pipeline *pipeline,
                const struct brw_cs_prog_data *prog_data,
+               struct intel_cs_dispatch_info dispatch,
               struct anv_address indirect_addr,
-               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
+               bool is_unaligned_size_x)
 {
    bool is_indirect = !anv_address_is_null(indirect_addr);
 
 #if GFX_VERx10 >= 125
-   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+   /* For an unaligned dispatch we need to tweak the dispatch value with
+    * MI_MATH, so we can't use the indirect HW instructions.
+    */
+   if (is_indirect && !is_unaligned_size_x &&
+       cmd_buffer->device->info->has_indirect_unroll) {
       emit_indirect_compute_walker(cmd_buffer, pipeline->cs,
                                    prog_data, indirect_addr);
       return;
@@ -534,11 +555,12 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
 #endif
 
    if (is_indirect)
-      compute_load_indirect_params(cmd_buffer, indirect_addr);
+      compute_load_indirect_params(cmd_buffer, indirect_addr,
+                                   is_unaligned_size_x);
 
 #if GFX_VERx10 >= 125
    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
-                       groupCountX, groupCountY, groupCountZ);
+                       dispatch, groupCountX, groupCountY, groupCountZ);
 #else
    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                      groupCountX, groupCountY, groupCountZ);
@@ -558,6 +580,8 @@ void genX(CmdDispatchBase)(
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
@@ -581,32 +605,154 @@ void genX(CmdDispatchBase)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data,
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
                   ANV_NULL_ADDRESS /* no indirect data */,
-                  groupCountX, groupCountY, groupCountZ);
+                  groupCountX, groupCountY, groupCountZ,
+                  false);
 
    trace_intel_end_compute(&cmd_buffer->trace,
                            groupCountX, groupCountY, groupCountZ);
 }
 
-void genX(CmdDispatchIndirect)(
+static void
+emit_unaligned_cs_walker(
     VkCommandBuffer                             commandBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset)
+    uint32_t                                    baseGroupX,
+    uint32_t                                    baseGroupY,
+    uint32_t                                    baseGroupZ,
+    uint32_t                                    groupCountX,
+    uint32_t                                    groupCountY,
+    uint32_t                                    groupCountZ,
+    struct intel_cs_dispatch_info               dispatch)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_address addr = anv_address_add(buffer->address, offset);
-   UNUSED struct anv_batch *batch = &cmd_buffer->batch;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
-                                  0, 0, 0, 0, 0, 0, addr);
+                                  baseGroupX, baseGroupY, baseGroupZ,
+                                  groupCountX, groupCountY, groupCountZ,
+                                  ANV_NULL_ADDRESS);
+
+   /* RT shaders always have the Y and Z local sizes set to 1. */
+   assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
+
+   /* RT shaders are always dispatched with group counts Y and Z set to 1.
+    */
+   assert(groupCountY == 1 && groupCountZ == 1);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_measure_snapshot(cmd_buffer,
+                        INTEL_SNAPSHOT_COMPUTE,
+                        "compute-unaligned-cs-walker",
+                        groupCountX * groupCountY * groupCountZ *
+                        prog_data->local_size[0] * prog_data->local_size[1] *
+                        prog_data->local_size[2]);
+
+   trace_intel_begin_compute(&cmd_buffer->trace);
+
+   assert(!prog_data->uses_num_work_groups);
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VERx10 >= 125
+   emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
+                       dispatch, groupCountX, groupCountY, groupCountZ);
+#endif
+
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ);
+}
+
+/*
+ * Dispatch a compute workload with an unaligned thread-invocation count.
+ *
+ * This helper takes an unaligned number of thread invocations, converts it
+ * into an aligned thread-group count and dispatches the compute work items.
+ *
+ * We launch two CS walkers: one for the aligned part and another with a
+ * single group for the remaining thread invocations.
+ *
+ * This function is currently only used for BVH building.
+ */
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+   /* The invocation count in X can be unaligned for RT dispatches. */
+   uint32_t groupCountX = invocations_x / prog_data->local_size[0];
+   uint32_t groupCountY = invocations_y;
+   uint32_t groupCountZ = invocations_z;
+
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
+
+   /* Launch the first CS walker with the aligned group count in X. */
+   if (groupCountX) {
+      emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
+                               groupCountY, groupCountZ, dispatch);
+   }
+
+   uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
+   if (unaligned_invocations_x) {
+      dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
+                                      dispatch.simd_size);
+
+      /* Make sure the 2nd walker has the same number of invocations per
+       * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
+       * calculated correctly with baseGroup.
+       */
+      assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
+
+      const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
+      if (remainder > 0) {
+         dispatch.right_mask = ~0u >> (32 - remainder);
+      } else {
+         dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
+      }
+
+      /* Launch a second CS walker for the unaligned remainder. */
+      emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
+                               dispatch);
+   }
+}
+
+/*
+ * This dispatches a compute workload with indirect parameters.
+ * The helper also aligns unaligned thread invocations.
+ */ +void +genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address indirect_addr, + bool is_unaligned_size_x) +{ + struct anv_compute_pipeline *pipeline = + anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline); + const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); + UNUSED struct anv_batch *batch = &cmd_buffer->batch; + struct intel_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data, + 0, 0, 0, 0, 0, 0, indirect_addr); anv_measure_snapshot(cmd_buffer, INTEL_SNAPSHOT_COMPUTE, @@ -619,10 +765,23 @@ void genX(CmdDispatchIndirect)( if (cmd_buffer->state.conditional_render_enabled) genX(cmd_emit_conditional_render_predicate)(cmd_buffer); - emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0); + emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0, + 0, 0, is_unaligned_size_x); trace_intel_end_compute_indirect(&cmd_buffer->trace, - anv_address_utrace(addr)); + anv_address_utrace(indirect_addr)); +} + +void genX(CmdDispatchIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_address addr = anv_address_add(buffer->address, offset); + + genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false); } struct anv_address