pvr: Upload spm load programs to device.

The programs are currently unused but will be needed for the SPM background object load op.

Signed-off-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Reviewed-by: Frank Binns <frank.binns@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21279>
2022-11-18 14:51:37 +00:00
parent d6408e08df
commit c75c58e54c
5 changed files with 223 additions and 0 deletions
--- a/src/imagination/vulkan/pvr_device.c
+++ b/src/imagination/vulkan/pvr_device.c
@@ -1778,6 +1778,10 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
   if (result != VK_SUCCESS)
      goto err_pvr_finish_compute_idfwdf;

+   result = pvr_device_init_spm_load_state(device);
+   if (result != VK_SUCCESS)
+      goto err_pvr_finish_graphics_static_clear_state;
+
   pvr_device_init_tile_buffer_state(device);

   result = pvr_queues_create(device, pCreateInfo);
@@ -1810,6 +1814,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,

 err_pvr_finish_tile_buffer_state:
   pvr_device_finish_tile_buffer_state(device);
+   pvr_device_finish_spm_load_state(device);
+
+err_pvr_finish_graphics_static_clear_state:
   pvr_device_finish_graphics_static_clear_state(device);

 err_pvr_finish_compute_idfwdf:
@@ -1860,6 +1867,7 @@ void pvr_DestroyDevice(VkDevice _device,
   pvr_spm_finish_scratch_buffer_store(device);
   pvr_queues_destroy(device);
   pvr_device_finish_tile_buffer_state(device);
+   pvr_device_finish_spm_load_state(device);
   pvr_device_finish_graphics_static_clear_state(device);
   pvr_device_finish_compute_idfwdf_state(device);
   pvr_device_destroy_compute_query_programs(device);
--- a/src/imagination/vulkan/pvr_private.h
+++ b/src/imagination/vulkan/pvr_private.h
@@ -366,6 +366,19 @@ struct pvr_device {
         [PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT_WITH_HOLES];
   } static_clear_state;

+   struct {
+      struct pvr_bo *usc_programs;
+      struct pvr_bo *pds_programs;
+
+      struct {
+         pvr_dev_addr_t pds_pixel_program_offset;
+         pvr_dev_addr_t pds_uniform_program_offset;
+
+         uint32_t pds_texture_program_data_size;
+         uint32_t pds_texture_program_temps_count;
+      } load_program[PVR_SPM_LOAD_PROGRAM_COUNT];
+   } spm_load_state;
+
   struct {
      simple_mtx_t mtx;

--- a/src/imagination/vulkan/pvr_spm.c
+++ b/src/imagination/vulkan/pvr_spm.c
@@ -23,13 +23,18 @@

 #include <stdint.h>
 #include <stddef.h>
+#include <string.h>
 #include <vulkan/vulkan_core.h>

 #include "c11/threads.h"
 #include "hwdef/rogue_hw_utils.h"
 #include "pvr_bo.h"
+#include "pvr_device_info.h"
+#include "pvr_pds.h"
 #include "pvr_private.h"
+#include "pvr_shader_factory.h"
 #include "pvr_spm.h"
+#include "pvr_static_shaders.h"
 #include "util/simple_mtx.h"
 #include "util/u_atomic.h"
 #include "vk_alloc.h"
@@ -236,3 +241,155 @@ VkResult pvr_spm_scratch_buffer_get_buffer(

   return VK_SUCCESS;
 }
+
+VkResult pvr_device_init_spm_load_state(struct pvr_device *device)
+{
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   uint32_t pds_texture_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t pds_kick_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t usc_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t pds_allocation_size = 0;
+   uint32_t usc_allocation_size = 0;
+   struct pvr_bo *pds_bo;
+   struct pvr_bo *usc_bo;
+   uint8_t *mem_ptr;
+   VkResult result;
+
+   static_assert(PVR_SPM_LOAD_PROGRAM_COUNT == ARRAY_SIZE(spm_load_collection),
+                 "Size mismatch");
+
+   /* Overview: for each entry in spm_load_collection this uploads one USC
+    * shader and two PDS programs (a texture/uniform code segment and a kick
+    * program), then records the resulting device addresses and sizes in
+    * device->spm_load_state. If a buffer object allocation fails, its error
+    * is returned and anything already allocated here is freed.
+    *
+    * TODO: We don't need to upload all the programs since the set contains
+    * programs for devices with 8 output regs as well. We can save some memory
+    * by not uploading them on devices without the feature.
+    * It's likely that once the compiler is hooked up we'll be using the shader
+    * cache and generate the shaders as needed so this todo will be unnecessary.
+    */
+
+   /* Upload USC shaders. The first loop assigns each shader an offset in one
+    * shared allocation (sizes go through ALIGN_POT(..., 4)); the second loop
+    * copies the shader code to those offsets through the CPU mapping.
+    */
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      usc_aligned_offsets[i] = usc_allocation_size;
+      usc_allocation_size += ALIGN_POT(spm_load_collection[i].size, 4);
+   }
+
+   result = pvr_bo_alloc(device,
+                         device->heaps.usc_heap,
+                         usc_allocation_size,
+                         4,
+                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                         &usc_bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   mem_ptr = (uint8_t *)usc_bo->bo->map;
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      memcpy(mem_ptr + usc_aligned_offsets[i],
+             spm_load_collection[i].code,
+             spm_load_collection[i].size);
+   }
+
+   pvr_bo_cpu_unmap(device, usc_bo);
+
+   /* Upload PDS programs. This first pass only computes sizes and offsets so
+    * that a single BO can be allocated for all of them; the programs
+    * themselves are generated into that BO by the second pass further down.
+    */
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
+         /* DMA for clear colors and tile buffer address parts. */
+         .num_texture_dma_kicks = 1,
+      };
+      struct pvr_pds_kickusc_program pds_kick_program = { 0 };
+
+      /* TODO: This looks a bit odd and isn't consistent with other code where
+       * we're getting the size of the PDS program. Can we improve this?
+       */
+      pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&pds_texture_program);
+      pvr_pds_set_sizes_pixel_shader_sa_texture_data(&pds_texture_program,
+                                                     dev_info);
+
+      /* TODO: Looking at the pvr_pds_generate_...() functions and the run-time
+       * behavior the data size is always the same here. Should we try saving
+       * some memory by adjusting things based on that?
+       */
+      device->spm_load_state.load_program[i].pds_texture_program_data_size =
+         pds_texture_program.data_size;
+
+      pds_texture_aligned_offsets[i] = pds_allocation_size;
+      /* FIXME: Figure out the define for alignment of 16.
+       * NOTE(review): the "* 4" presumably converts a size in dwords to
+       * bytes - confirm against the PDS size helpers.
+       */
+      pds_allocation_size += ALIGN_POT(pds_texture_program.code_size * 4, 16);
+
+      pvr_pds_set_sizes_pixel_shader(&pds_kick_program);
+
+      pds_kick_aligned_offsets[i] = pds_allocation_size;
+      /* FIXME: Figure out the define for alignment of 16. */
+      pds_allocation_size += ALIGN_POT(
+         (pds_kick_program.code_size + pds_kick_program.data_size) * 4,
+         16);
+   }
+
+   /* FIXME: Figure out the define for alignment of 16. */
+   result = pvr_bo_alloc(device,
+                         device->heaps.pds_heap,
+                         pds_allocation_size,
+                         16,
+                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                         &pds_bo);
+   if (result != VK_SUCCESS) {
+      pvr_bo_free(device, usc_bo);
+      return result;
+   }
+
+   mem_ptr = (uint8_t *)pds_bo->bo->map;
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
+         /* DMA for clear colors and tile buffer address parts. */
+         .num_texture_dma_kicks = 1,
+      };
+      const pvr_dev_addr_t usc_program_dev_addr =
+         PVR_DEV_ADDR_OFFSET(usc_bo->vma->dev_addr, usc_aligned_offsets[i]);
+      struct pvr_pds_kickusc_program pds_kick_program = { 0 };
+
+      pvr_pds_generate_pixel_shader_sa_code_segment(
+         &pds_texture_program,
+         (uint32_t *)(mem_ptr + pds_texture_aligned_offsets[i]));
+
+      pvr_pds_setup_doutu(&pds_kick_program.usc_task_control,
+                          usc_program_dev_addr.addr,
+                          spm_load_collection[i].info->temps_required,
+                          PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                          false);
+
+      /* Generates both code and data; the first PDS loop reserved
+       * code_size + data_size for this kick program.
+       */
+      pvr_pds_generate_pixel_shader_program(
+         &pds_kick_program,
+         (uint32_t *)(mem_ptr + pds_kick_aligned_offsets[i]));
+
+      device->spm_load_state.load_program[i].pds_pixel_program_offset =
+         PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
+                             pds_kick_aligned_offsets[i]);
+      device->spm_load_state.load_program[i].pds_uniform_program_offset =
+         PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
+                             pds_texture_aligned_offsets[i]);
+
+      /* TODO: From looking at the pvr_pds_generate_...() functions, it seems
+       * like temps_used is always 1. Should we remove this and hard code it
+       * with a define in the PDS code?
+       */
+      device->spm_load_state.load_program[i].pds_texture_program_temps_count =
+         pds_texture_program.temps_used;
+   }
+
+   pvr_bo_cpu_unmap(device, pds_bo);
+
+   device->spm_load_state.usc_programs = usc_bo;
+   device->spm_load_state.pds_programs = pds_bo;
+
+   return VK_SUCCESS;
+}
+
+void pvr_device_finish_spm_load_state(struct pvr_device *device)
+{ /* Frees the PDS and USC program BOs stored by the matching init. */
+   pvr_bo_free(device, device->spm_load_state.pds_programs);
+   pvr_bo_free(device, device->spm_load_state.usc_programs);
+}
--- a/src/imagination/vulkan/pvr_spm.h
+++ b/src/imagination/vulkan/pvr_spm.h
@@ -75,4 +75,8 @@ VkResult pvr_spm_scratch_buffer_get_buffer(
 void pvr_spm_scratch_buffer_release(struct pvr_device *device,
                                    struct pvr_spm_scratch_buffer *buffer);

+/* The SPM load programs are needed for the SPM background object load op.
+ *
+ * pvr_device_init_spm_load_state() uploads them and fills in
+ * device->spm_load_state. It returns VK_SUCCESS, or the error from the buffer
+ * object allocation that failed, in which case nothing is left allocated.
+ * pvr_device_finish_spm_load_state() frees the two buffer objects again;
+ * pvr_CreateDevice() only calls it once init has succeeded.
+ */
+VkResult pvr_device_init_spm_load_state(struct pvr_device *device);
+void pvr_device_finish_spm_load_state(struct pvr_device *device);
+
 #endif /* PVR_SPM_H */
--- a/src/imagination/vulkan/usc/programs/pvr_shader_factory.h
+++ b/src/imagination/vulkan/usc/programs/pvr_shader_factory.h
@@ -125,4 +125,45 @@ pvr_get_clear_attachment_program_index(uint32_t dword_count,
   return idx;
 }

+enum pvr_spm_load_const {
+   SPM_LOAD_CONST_TILE_BUFFER_1_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_1_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_2_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_2_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_3_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_3_LOWER,
+   /* The following are only available if the core does not have the
+    * has_eight_output_registers feature, i.e. only if the device has
+    * 4 output regs. That is the configuration with 7 rather than 3 tile
+    * buffer load programs in the PVR_SPM_LOAD_PROGRAM_COUNT breakdown.
+    */
+   SPM_LOAD_CONST_TILE_BUFFER_4_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_4_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_5_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_5_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_6_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_6_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_7_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_7_LOWER,
+};
+#define PVR_SPM_LOAD_DEST_UNUSED ~0 /* NOTE(review): not parenthesized. */
+
+#define PVR_SPM_LOAD_SAMPLES_COUNT 4U
+
+/* If output_regs == 8
+ *    reg_load_programs = 4            # 1, 2, 4, 8
+ *    tile_buffer_load_programs = 3    # 1, 2, 3
+ * else                                # output_regs == 4
+ *    reg_load_programs = 3            # 1, 2, 4
+ *    tile_buffer_load_programs = 7    # 1, 2, 3, 4, 5, 6, 7
+ *
+ * See PVR_SPM_LOAD_IN_BUFFERS_COUNT for where the amount of
+ * tile_buffer_load_programs comes from.
+ *
+ * Total = sample_count * (reg_load_programs + tile_buffer_load_programs),
+ * where sample_count is PVR_SPM_LOAD_SAMPLES_COUNT.
+ */
+/* FIXME: This is currently hard coded for the am62, i.e. the 4 output regs
+ * case above (3 + 7). The Chromebook has 8 output regs so the count is
+ * different.
+ */
+#define PVR_SPM_LOAD_PROGRAM_COUNT (PVR_SPM_LOAD_SAMPLES_COUNT * (3 + 7))
+
 #endif /* PVR_SHADER_FACTORY_H */