From 9e7d9a6efb6714848ea05209950e910fd7efe446 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 30 Jun 2021 10:43:54 +0200 Subject: [PATCH] v3dv: add support for geometry shaders to pipelines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gets our graphics pipelines (and pipeline cache) to accept and compile geometry shader modules. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/vulkan/v3dv_pipeline.c | 498 +++++++++++++++++++--- src/broadcom/vulkan/v3dv_pipeline_cache.c | 31 +- src/broadcom/vulkan/v3dv_private.h | 11 +- src/broadcom/vulkan/v3dvx_pipeline.c | 10 +- 4 files changed, 465 insertions(+), 85 deletions(-) diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 90c33a66e68..2fd7f0c457e 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -37,6 +37,9 @@ #include "vulkan/util/vk_format.h" +static VkResult +compute_vpm_config(struct v3dv_pipeline *pipeline); + void v3dv_print_v3d_key(struct v3d_key *key, uint32_t v3d_key_size) @@ -118,11 +121,15 @@ pipeline_free_stages(struct v3dv_device *device, */ destroy_pipeline_stage(device, pipeline->vs, pAllocator); destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); + destroy_pipeline_stage(device, pipeline->gs, pAllocator); + destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator); destroy_pipeline_stage(device, pipeline->fs, pAllocator); destroy_pipeline_stage(device, pipeline->cs, pAllocator); pipeline->vs = NULL; pipeline->vs_bin = NULL; + pipeline->gs = NULL; + pipeline->gs_bin = NULL; pipeline->fs = NULL; pipeline->cs = NULL; } @@ -999,6 +1006,18 @@ lower_fs_io(nir_shader *nir) type_size_vec4, 0); } +static void +lower_gs_io(struct nir_shader *nir) +{ + NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); + + nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, + MESA_SHADER_GEOMETRY); + + nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, + MESA_SHADER_GEOMETRY); +} + static void lower_vs_io(struct nir_shader *nir) { @@ -1063,12 +1082,23 @@ pipeline_populate_v3d_key(struct v3d_key *key, key->sampler[sampler_idx].return_size == 32 ? 4 : 2; } - - - /* default value. Would be override on the vs/gs populate methods when GS - * gets supported - */ - key->is_last_geometry_stage = true; + switch (p_stage->stage) { + case BROADCOM_SHADER_VERTEX: + case BROADCOM_SHADER_VERTEX_BIN: + key->is_last_geometry_stage = p_stage->pipeline->gs == NULL; + break; + case BROADCOM_SHADER_GEOMETRY: + case BROADCOM_SHADER_GEOMETRY_BIN: + /* FIXME: true until we implement tessellation shaders */ + key->is_last_geometry_stage = true; + break; + case BROADCOM_SHADER_FRAGMENT: + case BROADCOM_SHADER_COMPUTE: + key->is_last_geometry_stage = false; + break; + default: + unreachable("unsupported shader stage"); + } /* Vulkan doesn't have fixed function state for user clip planes.
Instead, * shaders can write to gl_ClipDistance[], in which case the SPIR-V compiler @@ -1128,6 +1158,8 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const struct v3dv_pipeline_stage *p_stage, uint32_t ucp_enables) { + assert(p_stage->stage == BROADCOM_SHADER_FRAGMENT); + memset(key, 0, sizeof(*key)); const bool rba = p_stage->pipeline->device->features.robustBufferAccess; @@ -1227,15 +1259,74 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, } static void -pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, +setup_stage_outputs_from_next_stage_inputs( + uint8_t next_stage_num_inputs, + struct v3d_varying_slot *next_stage_input_slots, + uint8_t *num_used_outputs, + struct v3d_varying_slot *used_output_slots, + uint32_t size_of_used_output_slots) +{ + *num_used_outputs = next_stage_num_inputs; + memcpy(used_output_slots, next_stage_input_slots, size_of_used_output_slots); +} + +static void +pipeline_populate_v3d_gs_key(struct v3d_gs_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct v3dv_pipeline_stage *p_stage) { + assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY || + p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN); + memset(key, 0, sizeof(*key)); const bool rba = p_stage->pipeline->device->features.robustBufferAccess; pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + struct v3dv_pipeline *pipeline = p_stage->pipeline; + + key->per_vertex_point_size = + p_stage->nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ); + + key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); + + assert(key->base.is_last_geometry_stage); + if (key->is_coord) { + /* Output varyings in the last binning shader are only used for transform + * feedback. Set to 0 as VK_EXT_transform_feedback is not supported. + */ + key->num_used_outputs = 0; + } else { + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(fs_variant->prog_data.fs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + fs_variant->prog_data.fs->num_inputs, + fs_variant->prog_data.fs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } +} + +static void +pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct v3dv_pipeline_stage *p_stage) +{ + assert(p_stage->stage == BROADCOM_SHADER_VERTEX || + p_stage->stage == BROADCOM_SHADER_VERTEX_BIN); + + memset(key, 0, sizeof(*key)); + + const bool rba = p_stage->pipeline->device->features.robustBufferAccess; + pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + + struct v3dv_pipeline *pipeline = p_stage->pipeline; + /* Vulkan specifies a point size per vertex, so true for if the prim are * points, like on ES2) */ @@ -1243,27 +1334,65 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, pCreateInfo->pInputAssemblyState; uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; - /* FIXME: not enough to being PRIM_POINTS, on gallium the full check is + /* FIXME: PRIM_POINTS is not enough, in gallium the full check is * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); - key->is_coord = p_stage->stage == BROADCOM_SHADER_VERTEX_BIN; - if (key->is_coord) { - /* The only output varying on coord shaders are for transform - * feedback. Set to 0 as VK_EXT_transform_feedback is not supported. 
- */ - key->num_used_outputs = 0; - } else { - struct v3dv_pipeline *pipeline = p_stage->pipeline; - struct v3dv_shader_variant *fs_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); - key->num_used_outputs = fs_variant->prog_data.fs->num_inputs; + if (key->is_coord) { /* Binning VS */ + if (key->base.is_last_geometry_stage) { + /* Output varyings in the last binning shader are only used for + * transform feedback. Set to 0 as VK_EXT_transform_feedback is not + * supported. + */ + key->num_used_outputs = 0; + } else { + /* Linking against GS binning program */ + assert(pipeline->gs); + struct v3dv_shader_variant *gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; - STATIC_ASSERT(sizeof(key->used_outputs) == - sizeof(fs_variant->prog_data.fs->input_slots)); - memcpy(key->used_outputs, fs_variant->prog_data.fs->input_slots, - sizeof(key->used_outputs)); + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(gs_bin_variant->prog_data.gs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + gs_bin_variant->prog_data.gs->num_inputs, + gs_bin_variant->prog_data.gs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } + } else { /* Render VS */ + if (pipeline->gs) { + /* Linking against GS render program */ + struct v3dv_shader_variant *gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(gs_variant->prog_data.gs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + gs_variant->prog_data.gs->num_inputs, + gs_variant->prog_data.gs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } else { + /* Linking against FS program */ + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(fs_variant->prog_data.fs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + fs_variant->prog_data.fs->num_inputs, + fs_variant->prog_data.fs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } } const VkPipelineVertexInputStateCreateInfo *vi_info = @@ -1375,14 +1504,18 @@ pipeline_hash_graphics(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - /* We need to include both on the sha1 key as one could affect the other - * during linking (like if vertex output are constants, then the - * fragment shader would load_const intead of load_input). An - * alternative would be to use the serialized nir, but that seems like - * an overkill + /* We need to include all shader stages in the sha1 key as linking may modify + * the shader code in any stage. An alternative would be to use the + * serialized NIR, but that seems like overkill. */ _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1, sizeof(pipeline->vs->shader_sha1)); + + if (pipeline->gs) { + _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1, + sizeof(pipeline->gs->shader_sha1)); + } + _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1, sizeof(pipeline->fs->shader_sha1)); @@ -1502,7 +1635,7 @@ v3dv_shader_variant_create(struct v3dv_device *device, * VK_ERROR_UNKNOWN, even if we know that the problem was a compiler * error.
*/ -static struct v3dv_shader_variant* +static struct v3dv_shader_variant * pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, struct v3d_key *key, size_t key_size, @@ -1703,7 +1836,7 @@ get_ucp_enable_mask(struct v3dv_pipeline_stage *p_stage) return 0; } -static nir_shader* +static nir_shader * pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache) @@ -1771,13 +1904,6 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - struct v3dv_pipeline_stage *p_stage = pipeline->vs; - - /* Right now we only support pipelines with both vertex and fragment - * shader. - */ - assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - assert(pipeline->vs_bin != NULL); if (pipeline->vs_bin->nir == NULL) { assert(pipeline->vs->nir); @@ -1793,8 +1919,7 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, if (vk_result != VK_SUCCESS) return vk_result; - p_stage = pipeline->vs_bin; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] = pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key), pAllocator, &vk_result); @@ -1802,6 +1927,36 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, return vk_result; } +static VkResult +pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline, + const VkAllocationCallbacks *pAllocator, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + assert(pipeline->gs); + + assert(pipeline->gs_bin != NULL); + if (pipeline->gs_bin->nir == NULL) { + assert(pipeline->gs->nir); + pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir); + } + + VkResult vk_result; + struct v3d_gs_key key; + pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs); + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] = + pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key), + pAllocator, &vk_result); + if (vk_result != VK_SUCCESS) + return vk_result; + + pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin); + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] = + pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key), + pAllocator, &vk_result); + + return vk_result; +} + static VkResult pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, @@ -1924,7 +2079,7 @@ pipeline_populate_compute_key(struct v3dv_pipeline *pipeline, static struct v3dv_pipeline_shared_data * v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], - struct v3dv_device *device, + struct v3dv_pipeline *pipeline, bool is_graphics_pipeline) { /* We create new_entry using the device alloc. Right now shared_data is ref @@ -1933,7 +2088,7 @@ v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], * unref. 
*/ struct v3dv_pipeline_shared_data *new_entry = - vk_zalloc2(&device->vk.alloc, NULL, + vk_zalloc2(&pipeline->device->vk.alloc, NULL, sizeof(struct v3dv_pipeline_shared_data), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -1941,10 +2096,10 @@ return NULL; for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { - /* We don't need specific descriptor map for vertex_bin, we can share - * with vertex + /* We don't need specific descriptor maps for binning stages; we use the + * map for the render stage. */ - if (stage == BROADCOM_SHADER_VERTEX_BIN) + if (broadcom_shader_stage_is_binning(stage)) continue; if ((is_graphics_pipeline && stage == BROADCOM_SHADER_COMPUTE) || continue; } + if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) + continue; + struct v3dv_descriptor_maps *new_maps = - vk_zalloc2(&device->vk.alloc, NULL, + vk_zalloc2(&pipeline->device->vk.alloc, NULL, sizeof(struct v3dv_descriptor_maps), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -1966,6 +2124,9 @@ new_entry->maps[BROADCOM_SHADER_VERTEX_BIN] = new_entry->maps[BROADCOM_SHADER_VERTEX]; + new_entry->maps[BROADCOM_SHADER_GEOMETRY_BIN] = + new_entry->maps[BROADCOM_SHADER_GEOMETRY]; + new_entry->ref_cnt = 1; memcpy(new_entry->sha1_key, sha1_key, 20); @@ -1975,11 +2136,11 @@ fail: if (new_entry != NULL) { for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { if (new_entry->maps[stage] != NULL) - vk_free(&device->vk.alloc, new_entry->maps[stage]); + vk_free(&pipeline->device->vk.alloc, new_entry->maps[stage]); } } - vk_free(&device->vk.alloc, new_entry); + vk_free(&pipeline->device->vk.alloc, new_entry); return NULL; } @@ -2053,11 +2214,21 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline_stage_create_binning(pipeline->vs, pAllocator); if (pipeline->vs_bin == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; break; + + case MESA_SHADER_GEOMETRY: + pipeline->has_gs = true; + pipeline->gs = p_stage; + pipeline->gs_bin = + pipeline_stage_create_binning(pipeline->gs, pAllocator); + if (pipeline->gs_bin == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; + break; + case MESA_SHADER_FRAGMENT: pipeline->fs = p_stage; break; + default: unreachable("not supported shader stage"); } @@ -2089,7 +2260,7 @@ pipeline->active_stages |= MESA_SHADER_FRAGMENT; } - /* Now we will try to get the variants from the pipeline cache */ + /* First we try to get the variants from the pipeline cache */ struct v3dv_pipeline_key pipeline_key; pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); unsigned char pipeline_sha1[20]; @@ -2099,29 +2270,46 @@ v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); if (pipeline->shared_data != NULL) { + /* A correct pipeline must have at least a VS and FS */ assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - + assert(!pipeline->gs || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); + assert(!pipeline->gs || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); goto success; } - pipeline->shared_data = -
v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline->device, true); - /* If not, we try to get the nir shaders (from the SPIR-V shader, or from - * the pipeline cache again) and compile. + /* Otherwise we try to get the NIR shaders (either from the original SPIR-V + * shader or the pipeline cache) and compile. */ + pipeline->shared_data = + v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true); + if (!pipeline->vs->nir) pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); + if (pipeline->gs && !pipeline->gs->nir) + pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache); if (!pipeline->fs->nir) pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache); /* Linking + pipeline lowerings */ - link_shaders(pipeline->vs->nir, pipeline->fs->nir); + if (pipeline->gs) { + link_shaders(pipeline->gs->nir, pipeline->fs->nir); + link_shaders(pipeline->vs->nir, pipeline->gs->nir); + } else { + link_shaders(pipeline->vs->nir, pipeline->fs->nir); + } pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout); lower_fs_io(pipeline->fs->nir); + if (pipeline->gs) { + pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout); + lower_gs_io(pipeline->gs->nir); + } + pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout); lower_vs_io(pipeline->vs->nir); @@ -2134,6 +2322,16 @@ if (vk_result != VK_SUCCESS) return vk_result; + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] && + !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (pipeline->gs) { + vk_result = + pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo); + if (vk_result != VK_SUCCESS) + return vk_result; + } + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] && !pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); @@ -2147,28 +2345,194 @@ v3dv_pipeline_cache_upload_pipeline(pipeline, cache); success: - /* As we got the variants in pipeline->shared_data, after compiling we - * don't need the pipeline_stages + /* Since we have the variants in the pipeline shared data we can now free + * the pipeline stages. */ pipeline_free_stages(device, pipeline, pAllocator); pipeline_check_spill_size(pipeline); - /* FIXME: values below are default when non-GS is available. Would need to - * provide real values if GS gets supported + return compute_vpm_config(pipeline); +} + +static inline uint32_t +compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo) +{ + assert(devinfo->vpm_size > 0); + const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; + return devinfo->vpm_size / sector_size; +} + +/* Computes various parameters affecting VPM memory configuration for programs + * involving geometry shaders to ensure the program fits in memory and honors + * requirements described in section "VPM usage" of the programming manual. + * + * FIXME: put this code in common and share with v3d. + */ +static bool +compute_vpm_config_gs(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_out) +{ + const uint32_t A = vs->separate_segments ? 1 : 0; + const uint32_t Ad = vs->vpm_input_size; + const uint32_t Vd = vs->vpm_output_size; + + const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo); + + /* Try to fit program into our VPM memory budget by adjusting + * configurable parameters iteratively.
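+ * (For scale: the cost of a candidate configuration is the vpm_sectors + * value computed in the loop below, A * As * Ad + (Vc + Ve) * Vd + Gs * Gd. + * With assumed illustrative values, not taken from this patch, of A=1, + * As=1, Ad=2, Vd=4, Vc=2, Ve=3, Gs=1 and Gd=8, that is 2 + 20 + 8 = 30 + * sectors.)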
We do this in two phases: + * the first phase tries to fit the program into the total available + * VPM memory. If we succeed at that, then the second phase attempts + * to fit the program into half of that budget so we can run bin and + * render programs in parallel. */ + struct vpm_config vpm_cfg[2]; + struct vpm_config *final_vpm_cfg = NULL; + uint32_t phase = 0; + + vpm_cfg[phase].As = 1; + vpm_cfg[phase].Gs = 1; + vpm_cfg[phase].Gd = gs->vpm_output_size; + vpm_cfg[phase].gs_width = gs->simd_width; + + /* While there is a requirement that Vc >= [Vn / 16], this is + * always the case when tessellation is not present because in that + * case Vn can only be 6 at most (when input primitive is triangles + * with adjacency). + * + * We always choose Vc=2. We can't go lower than this due to GFXH-1744, + * and Broadcom has not found it worth it to increase it beyond this + * in general. Increasing Vc also increases VPM memory pressure which + * can end up being detrimental for performance in some scenarios. + */ + vpm_cfg[phase].Vc = 2; + + /* Gv is a constraint on the hardware to not exceed the + * specified number of vertex segments per GS batch. If adding a + * new primitive to a GS batch would result in a range of more + * than Gv vertex segments being referenced by the batch, then + * the hardware will flush the batch and start a new one. This + * means that we can choose any value we want, we just need to + * be aware that larger values improve GS batch utilization + * at the expense of more VPM memory pressure (which can affect + * other performance aspects, such as GS dispatch width). + * We start with the largest value, and will reduce it if we + * find that total memory pressure is too high. + */ + vpm_cfg[phase].Gv = 3; + do { + /* When GS is present in absence of TES, then we need to satisfy + * that Ve >= Gv. We go with the smallest value of Ve to avoid + * increasing memory pressure. + */ + vpm_cfg[phase].Ve = vpm_cfg[phase].Gv; + + uint32_t vpm_sectors = + A * vpm_cfg[phase].As * Ad + + (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd + + vpm_cfg[phase].Gs * vpm_cfg[phase].Gd; + + /* Ideally we want to use no more than half of the available + * memory so we can execute a bin and render program in parallel + * without stalls. If we achieved that then we are done. + */ + if (vpm_sectors <= vpm_size / 2) { + final_vpm_cfg = &vpm_cfg[phase]; + break; + } + + /* At the very least, we should not allocate more than the + * total available VPM memory. If we have a configuration that + * succeeds at this we save it and continue to see if we can + * meet the half-memory-use criteria too. + */ + if (phase == 0 && vpm_sectors <= vpm_size) { + vpm_cfg[1] = vpm_cfg[0]; + phase = 1; + } + + /* Try lowering Gv */ + if (vpm_cfg[phase].Gv > 0) { + vpm_cfg[phase].Gv--; + continue; + } + + /* Try lowering GS dispatch width */ + if (vpm_cfg[phase].gs_width > 1) { + do { + vpm_cfg[phase].gs_width >>= 1; + vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2; + } while (vpm_cfg[phase].gs_width == 2); + + /* Reset Gv to max after dropping dispatch width */ + vpm_cfg[phase].Gv = 3; + continue; + } + + /* We ran out of options to reduce memory pressure. If we + * are at phase 1 we have at least a valid configuration, so we + * use that.
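+ * (When we switched to phase 1, the configuration that fit in total VPM + * memory stayed in vpm_cfg[0]; phase 1 iterations only mutate the + * vpm_cfg[1] copy, which is why we pick &vpm_cfg[0] below.)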
*/ + if (phase == 1) + final_vpm_cfg = &vpm_cfg[0]; + break; + } while (true); + + if (!final_vpm_cfg) + return false; + + assert(final_vpm_cfg); + assert(final_vpm_cfg->Gd <= 16); + assert(final_vpm_cfg->Gv < 4); + assert(final_vpm_cfg->Ve < 4); + assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); + assert(final_vpm_cfg->gs_width == 1 || + final_vpm_cfg->gs_width == 4 || + final_vpm_cfg->gs_width == 8 || + final_vpm_cfg->gs_width == 16); + + *vpm_cfg_out = *final_vpm_cfg; + return true; +} + +static VkResult +compute_vpm_config(struct v3dv_pipeline *pipeline) +{ struct v3dv_shader_variant *vs_variant = pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; struct v3dv_shader_variant *vs_bin_variant = pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + struct v3d_vs_prog_data *vs = vs_variant->prog_data.vs; + struct v3d_vs_prog_data *vs_bin = vs_bin_variant->prog_data.vs; - pipeline->vpm_cfg_bin.As = 1; - pipeline->vpm_cfg_bin.Ve = 0; - pipeline->vpm_cfg_bin.Vc = vs_bin_variant->prog_data.vs->vcm_cache_size; + if (!pipeline->has_gs) { + pipeline->vpm_cfg_bin.As = 1; + pipeline->vpm_cfg_bin.Ve = 0; + pipeline->vpm_cfg_bin.Vc = vs_bin->vcm_cache_size; - pipeline->vpm_cfg.As = 1; - pipeline->vpm_cfg.Ve = 0; - pipeline->vpm_cfg.Vc = vs_variant->prog_data.vs->vcm_cache_size; + pipeline->vpm_cfg.As = 1; + pipeline->vpm_cfg.Ve = 0; + pipeline->vpm_cfg.Vc = vs->vcm_cache_size; + } else { + struct v3dv_shader_variant *gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; + struct v3dv_shader_variant *gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + struct v3d_gs_prog_data *gs = gs_variant->prog_data.gs; + struct v3d_gs_prog_data *gs_bin = gs_bin_variant->prog_data.gs; + + if (!compute_vpm_config_gs(&pipeline->device->devinfo, + vs_bin, gs_bin, &pipeline->vpm_cfg_bin)) { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + if (!compute_vpm_config_gs(&pipeline->device->devinfo, + vs, gs, &pipeline->vpm_cfg)) { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + } return VK_SUCCESS; } @@ -2677,7 +3041,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, } pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, - pipeline->device, + pipeline, false); /* If not found on cache, compile it */ diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c index 1440e3cce62..fb9904be2bc 100644 --- a/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -325,11 +325,11 @@ v3dv_pipeline_shared_data_destroy(struct v3dv_device *device, if (shared_data->variants[stage] != NULL) v3dv_shader_variant_destroy(device, shared_data->variants[stage]); - /* We don't free the vertex_bin descriptor maps as we are sharing them - * with the vertex shader. + /* We don't free binning descriptor maps as we are sharing them + * with the render shaders.
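+ * (The binning entries in maps[] are aliases of the render-stage maps set + * up when the shared data was created, so freeing them here would be a + * double free.)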
*/ if (shared_data->maps[stage] != NULL && - stage != BROADCOM_SHADER_VERTEX_BIN) { + !broadcom_shader_stage_is_binning(stage)) { vk_free(&device->vk.alloc, shared_data->maps[stage]); } } @@ -563,8 +563,11 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, return NULL; memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps)); - if (stage == BROADCOM_SHADER_VERTEX) - maps[BROADCOM_SHADER_VERTEX_BIN] = maps[stage]; + if (broadcom_shader_stage_is_render_with_binning(stage)) { + enum broadcom_shader_stage bin_stage = + broadcom_binning_shader_stage_for_render_stage(stage); + maps[bin_stage] = maps[stage]; + } } uint8_t variant_count = blob_read_uint8(blob); @@ -835,25 +838,25 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * uint8_t descriptor_maps_count = 0; for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { - if (stage == BROADCOM_SHADER_VERTEX_BIN) + if (broadcom_shader_stage_is_binning(stage)) continue; if (cache_entry->maps[stage] == NULL) continue; descriptor_maps_count++; } - /* Right now we only support compute pipeline, or graphics pipeline with - * vertex, vertex bin, and fragment shader, but vertex and vertex bin - * descriptor maps are shared. + /* Compute pipelines have only one descriptor map, while graphics pipelines + * may have 2 (VS+FS) or 3 (VS+GS+FS), since the binning + * stages take the descriptor map from the render stage. */ - assert(descriptor_maps_count == 2 || + assert((descriptor_maps_count >= 2 && descriptor_maps_count <= 3) || (descriptor_maps_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE])); blob_write_uint8(blob, descriptor_maps_count); for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { if (cache_entry->maps[stage] == NULL) continue; - if (stage == BROADCOM_SHADER_VERTEX_BIN) + if (broadcom_shader_stage_is_binning(stage)) continue; blob_write_uint8(blob, stage); @@ -868,10 +871,10 @@ variant_count++; } - /* Right now we only support compute pipeline, or graphics pipeline with - * vertex, vertex bin, and fragment shader. + /* Graphics pipelines with VS+FS have 3 variants, those with VS+GS+FS have + * 5, and compute pipelines have only 1. */ - assert(variant_count == 3 || + assert((variant_count == 5 || variant_count == 3) || (variant_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE])); blob_write_uint8(blob, variant_count); diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index 152a9c0a34e..ca28f111884 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -1397,6 +1397,7 @@ struct v3dv_shader_variant { union { struct v3d_prog_data *base; struct v3d_vs_prog_data *vs; + struct v3d_gs_prog_data *gs; struct v3d_fs_prog_data *fs; struct v3d_compute_prog_data *cs; } prog_data; @@ -1738,14 +1739,20 @@ struct v3dv_pipeline { struct v3dv_render_pass *pass; struct v3dv_subpass *subpass; - /* Note: We can't use just a MESA_SHADER_STAGES array as we need to track - * too the coordinate shader + /* Note: We can't use just a MESA_SHADER_STAGES array because we also need + * to track binning shaders. These stages will be freed once the pipeline + * has been compiled.
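+ * (Since these pointers are NULL after compilation, later code cannot use + * pipeline->gs to tell whether a geometry stage is present; that is what + * the has_gs flag below is for.)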
*/ struct v3dv_pipeline_stage *vs; struct v3dv_pipeline_stage *vs_bin; + struct v3dv_pipeline_stage *gs; + struct v3dv_pipeline_stage *gs_bin; struct v3dv_pipeline_stage *fs; struct v3dv_pipeline_stage *cs; + /* Flags for whether optional pipeline stages are present, for convenience */ + bool has_gs; + /* Spilling memory requirements */ struct { struct v3dv_bo *bo; diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c index 47948c86ab2..8fb224df845 100644 --- a/src/broadcom/vulkan/v3dvx_pipeline.c +++ b/src/broadcom/vulkan/v3dvx_pipeline.c @@ -368,8 +368,14 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) v3dvx_pack(pipeline->shader_state_record, GL_SHADER_STATE_RECORD, shader) { shader.enable_clipping = true; - shader.point_size_in_shaded_vertex_data = - pipeline->topology == PIPE_PRIM_POINTS; + if (!pipeline->has_gs) { + shader.point_size_in_shaded_vertex_data = + pipeline->topology == PIPE_PRIM_POINTS; + } else { + struct v3d_gs_prog_data *prog_data_gs = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs; + shader.point_size_in_shaded_vertex_data = prog_data_gs->writes_psiz; + } /* Must be set if the shader modifies Z, discards, or modifies * the sample mask. For any of these cases, the fragment