From c75c58e54c5a17d97ac9579cd26ec10b79fb6154 Mon Sep 17 00:00:00 2001
From: Karmjit Mahil
Date: Fri, 18 Nov 2022 14:51:37 +0000
Subject: [PATCH] pvr: Upload spm load programs to device.

The programs are currently unused but will be needed for the spm
background object load op.

Signed-off-by: Karmjit Mahil
Reviewed-by: Frank Binns
Part-of:
---
 src/imagination/vulkan/pvr_device.c           |   8 +
 src/imagination/vulkan/pvr_private.h          |  15 ++
 src/imagination/vulkan/pvr_spm.c              | 173 ++++++++++++++++++
 src/imagination/vulkan/pvr_spm.h              |   4 +
 .../vulkan/usc/programs/pvr_shader_factory.h  |  41 ++++
 5 files changed, 241 insertions(+)

diff --git a/src/imagination/vulkan/pvr_device.c b/src/imagination/vulkan/pvr_device.c
index 98f221bf660..de4bcde608b 100644
--- a/src/imagination/vulkan/pvr_device.c
+++ b/src/imagination/vulkan/pvr_device.c
@@ -1778,6 +1778,10 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto err_pvr_finish_compute_idfwdf;
 
+   result = pvr_device_init_spm_load_state(device);
+   if (result != VK_SUCCESS)
+      goto err_pvr_finish_graphics_static_clear_state;
+
    pvr_device_init_tile_buffer_state(device);
 
    result = pvr_queues_create(device, pCreateInfo);
@@ -1810,6 +1814,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
 
 err_pvr_finish_tile_buffer_state:
    pvr_device_finish_tile_buffer_state(device);
+   pvr_device_finish_spm_load_state(device);
+
+err_pvr_finish_graphics_static_clear_state:
    pvr_device_finish_graphics_static_clear_state(device);
 
 err_pvr_finish_compute_idfwdf:
@@ -1860,6 +1867,7 @@ void pvr_DestroyDevice(VkDevice _device,
    pvr_spm_finish_scratch_buffer_store(device);
    pvr_queues_destroy(device);
    pvr_device_finish_tile_buffer_state(device);
+   pvr_device_finish_spm_load_state(device);
    pvr_device_finish_graphics_static_clear_state(device);
    pvr_device_finish_compute_idfwdf_state(device);
    pvr_device_destroy_compute_query_programs(device);
diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h
index e8fd902b2e0..e71ee78fe98 100644
--- a/src/imagination/vulkan/pvr_private.h
+++ b/src/imagination/vulkan/pvr_private.h
@@ -366,6 +366,21 @@ struct pvr_device {
          [PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT_WITH_HOLES];
    } static_clear_state;
 
+   /* SPM load programs; set up by pvr_device_init_spm_load_state(). */
+   struct {
+      struct pvr_bo *usc_programs;
+      struct pvr_bo *pds_programs;
+
+      struct {
+         /* Each is the pds_programs device address plus an offset into it. */
+         pvr_dev_addr_t pds_pixel_program_offset;
+         pvr_dev_addr_t pds_uniform_program_offset;
+
+         uint32_t pds_texture_program_data_size;
+         uint32_t pds_texture_program_temps_count;
+      } load_program[PVR_SPM_LOAD_PROGRAM_COUNT];
+   } spm_load_state;
+
    struct {
       simple_mtx_t mtx;
 
diff --git a/src/imagination/vulkan/pvr_spm.c b/src/imagination/vulkan/pvr_spm.c
index 3fd26d94ccd..a0d096ebb29 100644
--- a/src/imagination/vulkan/pvr_spm.c
+++ b/src/imagination/vulkan/pvr_spm.c
@@ -23,13 +23,18 @@
 
 #include
 #include
+#include
 #include
 
 #include "c11/threads.h"
 #include "hwdef/rogue_hw_utils.h"
 #include "pvr_bo.h"
+#include "pvr_device_info.h"
+#include "pvr_pds.h"
 #include "pvr_private.h"
+#include "pvr_shader_factory.h"
 #include "pvr_spm.h"
+#include "pvr_static_shaders.h"
 #include "util/simple_mtx.h"
 #include "util/u_atomic.h"
 #include "vk_alloc.h"
@@ -236,3 +241,171 @@ VkResult pvr_spm_scratch_buffer_get_buffer(
 
    return VK_SUCCESS;
 }
+
+/**
+ * \brief Uploads every program in spm_load_collection to the device.
+ *
+ * The USC code is packed into one buffer object on the USC heap. For each USC
+ * program a PDS texture program code segment and a PDS kick program (code and
+ * data) are then generated into a second buffer object on the PDS heap. Both
+ * buffer objects and the per-program PDS details are recorded in
+ * device->spm_load_state.
+ *
+ * \param[in] device Device to upload the programs to.
+ * \return VK_SUCCESS, or the pvr_bo_alloc() error if either allocation fails,
+ *         in which case neither buffer object is left allocated.
+ */
+VkResult pvr_device_init_spm_load_state(struct pvr_device *device)
+{
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   uint32_t pds_texture_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t pds_kick_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t usc_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
+   uint32_t pds_allocation_size = 0;
+   uint32_t usc_allocation_size = 0;
+   struct pvr_bo *pds_bo;
+   struct pvr_bo *usc_bo;
+   uint8_t *mem_ptr;
+   VkResult result;
+
+   static_assert(PVR_SPM_LOAD_PROGRAM_COUNT == ARRAY_SIZE(spm_load_collection),
+                 "Size mismatch");
+
+   /* TODO: We don't need to upload all the programs since the set contains
+    * programs for devices with 8 output regs as well. We can save some memory
+    * by not uploading them on devices without the feature.
+    * It's likely that once the compiler is hooked up we'll be using the shader
+    * cache and generate the shaders as needed so this todo will be unnecessary.
+    */
+
+   /* Upload USC shaders. */
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      usc_aligned_offsets[i] = usc_allocation_size;
+      usc_allocation_size += ALIGN_POT(spm_load_collection[i].size, 4);
+   }
+
+   result = pvr_bo_alloc(device,
+                         device->heaps.usc_heap,
+                         usc_allocation_size,
+                         4,
+                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                         &usc_bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   mem_ptr = (uint8_t *)usc_bo->bo->map;
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      memcpy(mem_ptr + usc_aligned_offsets[i],
+             spm_load_collection[i].code,
+             spm_load_collection[i].size);
+   }
+
+   pvr_bo_cpu_unmap(device, usc_bo);
+
+   /* Upload PDS programs. */
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
+         /* DMA for clear colors and tile buffer address parts. */
+         .num_texture_dma_kicks = 1,
+      };
+      struct pvr_pds_kickusc_program pds_kick_program = { 0 };
+
+      /* TODO: This looks a bit odd and isn't consistent with other code where
+       * we're getting the size of the PDS program. Can we improve this?
+       */
+      pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&pds_texture_program);
+      pvr_pds_set_sizes_pixel_shader_sa_texture_data(&pds_texture_program,
+                                                     dev_info);
+
+      /* TODO: Looking at the pvr_pds_generate_...() functions and the run-time
+       * behavior the data size is always the same here. Should we try saving
+       * some memory by adjusting things based on that?
+       */
+      device->spm_load_state.load_program[i].pds_texture_program_data_size =
+         pds_texture_program.data_size;
+
+      pds_texture_aligned_offsets[i] = pds_allocation_size;
+      /* FIXME: Figure out the define for alignment of 16. */
+      pds_allocation_size += ALIGN_POT(pds_texture_program.code_size * 4, 16);
+
+      pvr_pds_set_sizes_pixel_shader(&pds_kick_program);
+
+      pds_kick_aligned_offsets[i] = pds_allocation_size;
+      /* FIXME: Figure out the define for alignment of 16. */
+      pds_allocation_size += ALIGN_POT(
+         (pds_kick_program.code_size + pds_kick_program.data_size) * 4,
+         16);
+   }
+
+   /* FIXME: Figure out the define for alignment of 16. */
+   result = pvr_bo_alloc(device,
+                         device->heaps.pds_heap,
+                         pds_allocation_size,
+                         16,
+                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                         &pds_bo);
+   if (result != VK_SUCCESS) {
+      pvr_bo_free(device, usc_bo);
+      return result;
+   }
+
+   mem_ptr = (uint8_t *)pds_bo->bo->map;
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
+      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
+         /* DMA for clear colors and tile buffer address parts. */
+         .num_texture_dma_kicks = 1,
+      };
+      const pvr_dev_addr_t usc_program_dev_addr =
+         PVR_DEV_ADDR_OFFSET(usc_bo->vma->dev_addr, usc_aligned_offsets[i]);
+      struct pvr_pds_kickusc_program pds_kick_program = { 0 };
+
+      pvr_pds_generate_pixel_shader_sa_code_segment(
+         &pds_texture_program,
+         (uint32_t *)(mem_ptr + pds_texture_aligned_offsets[i]));
+
+      pvr_pds_setup_doutu(&pds_kick_program.usc_task_control,
+                          usc_program_dev_addr.addr,
+                          spm_load_collection[i].info->temps_required,
+                          PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                          false);
+
+      /* Generates both code and data. */
+      pvr_pds_generate_pixel_shader_program(
+         &pds_kick_program,
+         (uint32_t *)(mem_ptr + pds_kick_aligned_offsets[i]));
+
+      device->spm_load_state.load_program[i].pds_pixel_program_offset =
+         PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
+                             pds_kick_aligned_offsets[i]);
+      device->spm_load_state.load_program[i].pds_uniform_program_offset =
+         PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
+                             pds_texture_aligned_offsets[i]);
+
+      /* TODO: From looking at the pvr_pds_generate_...() functions, it seems
+       * like temps_used is always 1. Should we remove this and hard code it
+       * with a define in the PDS code?
+       */
+      device->spm_load_state.load_program[i].pds_texture_program_temps_count =
+         pds_texture_program.temps_used;
+   }
+
+   pvr_bo_cpu_unmap(device, pds_bo);
+
+   device->spm_load_state.usc_programs = usc_bo;
+   device->spm_load_state.pds_programs = pds_bo;
+
+   return VK_SUCCESS;
+}
+
+/**
+ * \brief Frees the buffer objects set up by pvr_device_init_spm_load_state().
+ */
+void pvr_device_finish_spm_load_state(struct pvr_device *device)
+{
+   pvr_bo_free(device, device->spm_load_state.pds_programs);
+   pvr_bo_free(device, device->spm_load_state.usc_programs);
+}
diff --git a/src/imagination/vulkan/pvr_spm.h b/src/imagination/vulkan/pvr_spm.h
index a3475d9c3f6..71c48e8f113 100644
--- a/src/imagination/vulkan/pvr_spm.h
+++ b/src/imagination/vulkan/pvr_spm.h
@@ -75,4 +75,8 @@ VkResult pvr_spm_scratch_buffer_get_buffer(
 void pvr_spm_scratch_buffer_release(struct pvr_device *device,
                                     struct pvr_spm_scratch_buffer *buffer);
 
+/* The SPM load programs are needed for the SPM background object load op. */
+VkResult pvr_device_init_spm_load_state(struct pvr_device *device);
+void pvr_device_finish_spm_load_state(struct pvr_device *device);
+
 #endif /* PVR_SPM_H */
diff --git a/src/imagination/vulkan/usc/programs/pvr_shader_factory.h b/src/imagination/vulkan/usc/programs/pvr_shader_factory.h
index 980f3a533f5..dc8915df1f0 100644
--- a/src/imagination/vulkan/usc/programs/pvr_shader_factory.h
+++ b/src/imagination/vulkan/usc/programs/pvr_shader_factory.h
@@ -125,4 +125,45 @@ pvr_get_clear_attachment_program_index(uint32_t dword_count,
    return idx;
 }
 
+enum pvr_spm_load_const {
+   SPM_LOAD_CONST_TILE_BUFFER_1_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_1_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_2_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_2_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_3_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_3_LOWER,
+   /* The following are only available if the core does not have the
+    * has_eight_output_registers feature. I.e. only available if the device has
+    * 4 output regs.
+    */
+   SPM_LOAD_CONST_TILE_BUFFER_4_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_4_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_5_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_5_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_6_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_6_LOWER,
+   SPM_LOAD_CONST_TILE_BUFFER_7_UPPER,
+   SPM_LOAD_CONST_TILE_BUFFER_7_LOWER,
+};
+#define PVR_SPM_LOAD_DEST_UNUSED (~0)
+
+#define PVR_SPM_LOAD_SAMPLES_COUNT 4U
+
+/* If output_regs == 8
+ *    reg_load_programs = 4            # 1, 2, 4, 8
+ *    tile_buffer_load_programs = 3    # 1, 2, 3
+ * else                                #output_regs == 4
+ *    reg_load_programs = 3            # 1, 2, 4
+ *    tile_buffer_load_programs = 7    # 1, 2, 3, 4, 5, 6, 7
+ *
+ * See PVR_SPM_LOAD_IN_BUFFERS_COUNT for where the amount of
+ * tile_buffer_load_programs comes from.
+ *
+ * Tot = sample_count * (reg_load_programs + tile_buffer_load_programs)
+ */
+/* FIXME: This is currently hard coded for the am62. The Chromebook has 8
+ * output regs so the count is different.
+ */
+#define PVR_SPM_LOAD_PROGRAM_COUNT (PVR_SPM_LOAD_SAMPLES_COUNT * (3 + 7))
+
 #endif /* PVR_SHADER_FACTORY_H */