From 53426d26c3700355f88578c0ef70a926faf21f2b Mon Sep 17 00:00:00 2001
From: Emma Anholt <emma@anholt.net>
Date: Tue, 25 Jan 2022 21:31:06 -0800
Subject: [PATCH] softpipe: Dispatch 4 CS invocations per tgsi_exec thread.

We were executing 1 non-helper invocation and 3 helpers per CS tgsi_exec
machine, which was a total waste of the CPU when we could trivially have
all 4 invocations do real work (at least in the common case of a
gl_WorkGroupSize.x >= 4).

This didn't have the effect on dEQP that I was hoping for, as it turns out
that its shaders are almost all 1x1x1 workgroups.  However, it does reduce
the runtime of piglit arb_compute_shader-local-id from 2:10 to 47 seconds
on my system.

Part of #4097

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14728>
---
 src/gallium/drivers/softpipe/sp_compute.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_compute.c b/src/gallium/drivers/softpipe/sp_compute.c
index ba389620a21..221ef7ec797 100644
--- a/src/gallium/drivers/softpipe/sp_compute.c
+++ b/src/gallium/drivers/softpipe/sp_compute.c
@@ -55,7 +55,7 @@ cs_prepare(const struct sp_compute_shader *cs,
    if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) {
       unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID];
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-         machine->SystemValue[i].xyzw[0].i[j] = local_x;
+         machine->SystemValue[i].xyzw[0].i[j] = local_x + j;
          machine->SystemValue[i].xyzw[1].i[j] = local_y;
          machine->SystemValue[i].xyzw[2].i[j] = local_z;
       }
@@ -180,7 +180,7 @@ softpipe_launch_grid(struct pipe_context *context,
    bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH];
    bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT];
    bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
-   num_threads_in_group = bwidth * bheight * bdepth;
+   num_threads_in_group = DIV_ROUND_UP(bwidth, TGSI_QUAD_SIZE) * bheight * bdepth;
 
    fill_grid_size(context, info, grid_size);
 
@@ -195,15 +195,15 @@ softpipe_launch_grid(struct pipe_context *context,
    }
 
    /* initialise machines + GRID_SIZE + THREAD_ID  + BLOCK_SIZE */
+   int idx = 0;
    for (local_z = 0; local_z < bdepth; local_z++) {
       for (local_y = 0; local_y < bheight; local_y++) {
-         for (local_x = 0; local_x < bwidth; local_x++) {
-            int idx = local_x + (local_y * bwidth) + (local_z * bheight * bwidth);
+         for (local_x = 0; local_x < bwidth; local_x += TGSI_QUAD_SIZE) {
             machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE);
 
             machines[idx]->LocalMem = local_mem;
             machines[idx]->LocalMemSize = cs->shader.req_local_mem;
-            machines[idx]->NonHelperMask = 0x1;
+            machines[idx]->NonHelperMask = (1 << (MIN2(TGSI_QUAD_SIZE, bwidth - local_x))) - 1;
             cs_prepare(cs, machines[idx],
                        local_x, local_y, local_z,
                        grid_size[0], grid_size[1], grid_size[2],
@@ -214,6 +214,7 @@ softpipe_launch_grid(struct pipe_context *context,
             tgsi_exec_set_constant_buffers(machines[idx], PIPE_MAX_CONSTANT_BUFFERS,
                                            softpipe->mapped_constants[PIPE_SHADER_COMPUTE],
                                            softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]);
+            idx++;
          }
       }
    }