From 53426d26c3700355f88578c0ef70a926faf21f2b Mon Sep 17 00:00:00 2001
From: Emma Anholt
Date: Tue, 25 Jan 2022 21:31:06 -0800
Subject: [PATCH] softpipe: Dispatch 4 CS invocations per tgsi_exec thread.

We were executing 1 non-helper invocation and 3 helpers per CS tgsi_exec
machine, which was a total waste of the CPU when we could trivially have
all 4 invocations do real work (at least in the common case of a
gl_WorkGroupSize.x >= 4).

This didn't have the effect on dEQP that I was hoping for, as it turns
out that its shaders are almost all 1x1x1 workgroups. However, it does
reduce the runtime of piglit arb_compute_shader-local-id from 2:10 to 47
seconds on my system.

Part of #4097

Reviewed-by: Dave Airlie
Part-of:
---
 src/gallium/drivers/softpipe/sp_compute.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_compute.c b/src/gallium/drivers/softpipe/sp_compute.c
index ba389620a21..221ef7ec797 100644
--- a/src/gallium/drivers/softpipe/sp_compute.c
+++ b/src/gallium/drivers/softpipe/sp_compute.c
@@ -55,7 +55,7 @@ cs_prepare(const struct sp_compute_shader *cs,
    if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) {
       unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID];
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-         machine->SystemValue[i].xyzw[0].i[j] = local_x;
+         machine->SystemValue[i].xyzw[0].i[j] = local_x + j;
          machine->SystemValue[i].xyzw[1].i[j] = local_y;
          machine->SystemValue[i].xyzw[2].i[j] = local_z;
       }
@@ -180,7 +180,7 @@ softpipe_launch_grid(struct pipe_context *context,
    bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH];
    bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT];
    bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
-   num_threads_in_group = bwidth * bheight * bdepth;
+   num_threads_in_group = DIV_ROUND_UP(bwidth, TGSI_QUAD_SIZE) * bheight * bdepth;
 
    fill_grid_size(context, info, grid_size);
 
@@ -195,15 +195,15 @@ softpipe_launch_grid(struct pipe_context *context,
    }
 
    /* initialise machines + GRID_SIZE + THREAD_ID + BLOCK_SIZE */
+   int idx = 0;
    for (local_z = 0; local_z < bdepth; local_z++) {
       for (local_y = 0; local_y < bheight; local_y++) {
-         for (local_x = 0; local_x < bwidth; local_x++) {
-            int idx = local_x + (local_y * bwidth) + (local_z * bheight * bwidth);
+         for (local_x = 0; local_x < bwidth; local_x += TGSI_QUAD_SIZE) {
             machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE);
 
             machines[idx]->LocalMem = local_mem;
             machines[idx]->LocalMemSize = cs->shader.req_local_mem;
-            machines[idx]->NonHelperMask = 0x1;
+            machines[idx]->NonHelperMask = (1 << (MIN2(TGSI_QUAD_SIZE, bwidth - local_x))) - 1;
             cs_prepare(cs, machines[idx],
                        local_x, local_y, local_z,
                        grid_size[0], grid_size[1], grid_size[2],
@@ -214,6 +214,7 @@ softpipe_launch_grid(struct pipe_context *context,
             tgsi_exec_set_constant_buffers(machines[idx], PIPE_MAX_CONSTANT_BUFFERS,
                                            softpipe->mapped_constants[PIPE_SHADER_COMPUTE],
                                            softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]);
+            idx++;
          }
       }
    }