intel: Move subslice_total into devinfo

Reworks:
 * Move asserts for subslice_total into intel_device_info.c (s-b Ken)
 * Drop now unused intel_device_info_subslice_total (s-b Ken)
 * Add comment for subslice_total (Ken)

Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12799>
2021-09-08 16:20:24 -07:00
parent 3c18e69078
commit 32e848aeaa
20 changed files with 30 additions and 65 deletions
--- a/src/gallium/drivers/crocus/crocus_program.c
+++ b/src/gallium/drivers/crocus/crocus_program.c
@@ -2645,9 +2645,8 @@ crocus_get_scratch_space(struct crocus_context *ice,

   struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];

-   unsigned subslice_total = screen->subslice_total;
-   subslice_total = 4 * devinfo->num_slices;
-   //   assert(subslice_total >= screen->subslice_total);
+   /* TODO: This doesn't seem to match brw_alloc_stage_scratch */
+   unsigned cs_subslices = 4 * devinfo->num_slices;

   if (!*bop) {
      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
@@ -2658,7 +2657,7 @@ crocus_get_scratch_space(struct crocus_context *ice,
         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
-         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslice_total,
+         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * cs_subslices,
      };

      uint32_t size = per_thread_scratch * max_threads[stage];
--- a/src/gallium/drivers/crocus/crocus_screen.c
+++ b/src/gallium/drivers/crocus/crocus_screen.c
@@ -813,9 +813,6 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config)
   slab_create_parent(&screen->transfer_pool,
                      sizeof(struct crocus_transfer), 64);

-   screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo);
-   assert(screen->subslice_total >= 1);
-
   struct pipe_screen *pscreen = &screen->base;

   crocus_init_screen_fence_functions(pscreen);
--- a/src/gallium/drivers/crocus/crocus_screen.h
+++ b/src/gallium/drivers/crocus/crocus_screen.h
@@ -201,8 +201,6 @@ struct crocus_screen {
      bool always_flush_cache;
   } driconf;

-   unsigned subslice_total;
-
   uint64_t aperture_bytes;

   struct intel_device_info devinfo;
--- a/src/gallium/drivers/crocus/crocus_state.c
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -8075,7 +8075,7 @@ crocus_upload_compute_state(struct crocus_context *ice,
         }

         vfe.MaximumNumberofThreads =
-            devinfo->max_cs_threads * screen->subslice_total - 1;
+            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -2334,7 +2334,7 @@ iris_get_scratch_space(struct iris_context *ice,
    * For, Gfx11+, scratch space allocation is based on the number of threads
    * in the base configuration.
    */
-   unsigned subslice_total = screen->subslice_total;
+   unsigned subslice_total = devinfo->subslice_total;
   if (devinfo->verx10 == 125)
      subslice_total = 32;
   else if (devinfo->ver == 12)
@@ -2343,7 +2343,7 @@ iris_get_scratch_space(struct iris_context *ice,
      subslice_total = 8;
   else if (devinfo->ver < 11)
      subslice_total = 4 * devinfo->num_slices;
-   assert(subslice_total >= screen->subslice_total);
+   assert(subslice_total >= devinfo->subslice_total);

   if (!*bop) {
      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -872,9 +872,6 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
   slab_create_parent(&screen->transfer_pool,
                      sizeof(struct iris_transfer), 64);

-   screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo);
-   assert(screen->subslice_total >= 1);
-
   iris_detect_kernel_features(screen);

   struct pipe_screen *pscreen = &screen->base;
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -185,8 +185,6 @@ struct iris_screen {
   unsigned kernel_features;
 #define KERNEL_HAS_WAIT_FOR_SUBMIT (1<<0)

-   unsigned subslice_total;
-
   uint64_t aperture_bytes;

   /**
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -6920,7 +6920,7 @@ iris_upload_compute_walker(struct iris_context *ice,
   if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
      iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
         cfe.MaximumNumberofThreads =
-            devinfo->max_cs_threads * screen->subslice_total - 1;
+            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         if (prog_data->total_scratch > 0) {
            cfe.ScratchSpaceBuffer =
               iris_get_scratch_surf(ice, prog_data->total_scratch)->offset >> 4;
@@ -7003,7 +7003,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice,
         }

         vfe.MaximumNumberofThreads =
-            devinfo->max_cs_threads * screen->subslice_total - 1;
+            devinfo->max_cs_threads * devinfo->subslice_total - 1;
 #if GFX_VER < 11
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -1538,5 +1538,13 @@ intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
   intel_get_aperture_size(fd, &devinfo->aperture_bytes);
   devinfo->has_tiling_uapi = has_get_tiling(fd);

+   devinfo->subslice_total = 0;
+   for (uint32_t i = 0; i < devinfo->num_slices; i++)
+      devinfo->subslice_total += __builtin_popcount(devinfo->subslice_masks[i]);
+
+   /* Gfx7 and older do not support EU/Subslice info */
+   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
+   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
+
   return true;
 }
--- a/src/intel/dev/intel_device_info.h
+++ b/src/intel/dev/intel_device_info.h
@@ -166,6 +166,12 @@ struct intel_device_info
   uint8_t subslice_masks[INTEL_DEVICE_MAX_SLICES *
                          DIV_ROUND_UP(INTEL_DEVICE_MAX_SUBSLICES, 8)];

+   /**
+    * The number of enabled subslices (considering fusing). For exactly which
+    * subslices are enabled, see subslice_masks[].
+    */
+   unsigned subslice_total;
+
   /**
    * An array of bit mask of EUs available, use eu_slice_stride &
    * eu_subslice_stride to access this array.
@@ -332,17 +338,6 @@ intel_device_info_eu_available(const struct intel_device_info *devinfo,
   return (devinfo->eu_masks[subslice_offset + eu / 8] & (1U << eu % 8)) != 0;
 }

-static inline uint32_t
-intel_device_info_subslice_total(const struct intel_device_info *devinfo)
-{
-   uint32_t total = 0;
-
-   for (uint32_t i = 0; i < devinfo->num_slices; i++)
-      total += __builtin_popcount(devinfo->subslice_masks[i]);
-
-   return total;
-}
-
 static inline uint32_t
 intel_device_info_eu_total(const struct intel_device_info *devinfo)
 {
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -1464,7 +1464,7 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
   if (bo != NULL)
      return bo;

-   unsigned subslices = MAX2(device->physical->subslice_total, 1);
+   unsigned subslices = devinfo->subslice_total;

   /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
    *
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -940,9 +940,6 @@ anv_physical_device_try_create(struct anv_instance *instance,
   device->has_userptr_probe =
      anv_gem_get_param(fd, I915_PARAM_HAS_USERPTR_PROBE);

-   /* GENs prior to 8 do not support EU/Subslice info */
-   device->subslice_total = intel_device_info_subslice_total(&device->info);
-
   device->compiler = brw_compiler_create(NULL, &device->info);
   if (device->compiler == NULL) {
      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -960,8 +960,6 @@ struct anv_physical_device {

    bool                                        always_flush_cache;

-    uint32_t                                    subslice_total;
-
    struct {
      uint32_t                                  family_count;
      struct anv_queue_family                   families[ANV_MAX_QUEUE_FAMILIES];
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -5401,11 +5401,9 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
       * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
       * really know why.
       */
-      const uint32_t subslices =
-         MAX2(cmd_buffer->device->physical->subslice_total, 1);
      anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
         vfe.MaximumNumberofThreads =
-            devinfo->max_cs_threads * subslices - 1;
+            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.NumberofURBEntries     = 2;
         vfe.URBEntryAllocationSize = 2;
      }
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -2589,14 +2589,12 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

-   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);
-
   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
   const struct intel_device_info *devinfo = &device->info;

   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
-         devinfo->max_cs_threads * subslices - 1;
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
   }
 }
@@ -2618,8 +2616,6 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
      ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
            cs_prog_data->push.cross_thread.regs, 2);

-   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);
-
   const struct anv_shader_bin *cs_bin = pipeline->cs;

   anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
@@ -2629,7 +2625,7 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
      vfe.GPGPUMode              = true;
 #endif
      vfe.MaximumNumberofThreads =
-         devinfo->max_cs_threads * subslices - 1;
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries     = GFX_VER <= 7 ? 0 : 2;
 #if GFX_VER < 11
      vfe.ResetGatewayTimer      = true;
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -497,9 +497,8 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
      /* We seem to have issues with geometry flickering when 3D and compute
       * are combined in the same batch and this appears to fix it.
       */
-      const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
      const uint32_t maxNumberofThreads =
-         devinfo->max_cs_threads * subslices - 1;
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;

      BEGIN_BATCH(9);
      OUT_BATCH(MEDIA_VFE_STATE << 16 | (9 - 2));
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -455,7 +455,7 @@ brw_alloc_stage_scratch(struct brw_context *brw,
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
-      unsigned subslices = MAX2(brw->screen->subslice_total, 1);
+      unsigned subslices = devinfo->subslice_total;

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
--- a/src/mesa/drivers/dri/i965/brw_screen.c
+++ b/src/mesa/drivers/dri/i965/brw_screen.c
@@ -2611,9 +2611,6 @@ __DRIconfig **brw_init_screen(__DRIscreen *dri_screen)
   isl_device_init(&screen->isl_dev, &screen->devinfo,
                   screen->hw_has_swizzling);

-   /* GENs prior to 8 do not support EU/Subslice info */
-   screen->subslice_total = intel_device_info_subslice_total(devinfo);
-
   /* Gfx7-7.5 kernel requirements / command parser saga:
    *
    * - pre-v3.16:
--- a/src/mesa/drivers/dri/i965/brw_screen.h
+++ b/src/mesa/drivers/dri/i965/brw_screen.h
@@ -107,11 +107,6 @@ struct brw_screen
    */
   int cmd_parser_version;

-   /**
-    * Number of subslices reported by the I915_PARAM_SUBSLICE_TOTAL parameter
-    */
-   int subslice_total;
-
   bool mesa_format_supports_texture[MESA_FORMAT_COUNT];
   bool mesa_format_supports_render[MESA_FORMAT_COUNT];
   enum isl_format mesa_to_isl_render_format[MESA_FORMAT_COUNT];
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -4321,15 +4321,8 @@ genX(upload_cs_state)(struct brw_context *brw)
         vfe.PerThreadScratchSpace = per_thread_scratch_value;
      }

-      /* If brw->screen->subslice_total is greater than one, then
-       * devinfo->max_cs_threads stores number of threads per sub-slice;
-       * thus we need to multiply by that number by subslices to get
-       * the actual maximum number of threads; the -1 is because the HW
-       * has a bias of 1 (would not make sense to say the maximum number
-       * of threads is 0).
-       */
-      const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
-      vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
+      vfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
 #if GFX_VER < 11
      vfe.ResetGatewayTimer =