pvr: Upload spm load programs to device.
The programs are currently unused but will be needed for the spm background object load op. Signed-off-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com> Reviewed-by: Frank Binns <frank.binns@imgtec.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21279>
This commit is contained in:
@@ -1778,6 +1778,10 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||
if (result != VK_SUCCESS)
|
||||
goto err_pvr_finish_compute_idfwdf;
|
||||
|
||||
result = pvr_device_init_spm_load_state(device);
|
||||
if (result != VK_SUCCESS)
|
||||
goto err_pvr_finish_graphics_static_clear_state;
|
||||
|
||||
pvr_device_init_tile_buffer_state(device);
|
||||
|
||||
result = pvr_queues_create(device, pCreateInfo);
|
||||
@@ -1810,6 +1814,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||
|
||||
err_pvr_finish_tile_buffer_state:
|
||||
pvr_device_finish_tile_buffer_state(device);
|
||||
pvr_device_finish_spm_load_state(device);
|
||||
|
||||
err_pvr_finish_graphics_static_clear_state:
|
||||
pvr_device_finish_graphics_static_clear_state(device);
|
||||
|
||||
err_pvr_finish_compute_idfwdf:
|
||||
@@ -1860,6 +1867,7 @@ void pvr_DestroyDevice(VkDevice _device,
|
||||
pvr_spm_finish_scratch_buffer_store(device);
|
||||
pvr_queues_destroy(device);
|
||||
pvr_device_finish_tile_buffer_state(device);
|
||||
pvr_device_finish_spm_load_state(device);
|
||||
pvr_device_finish_graphics_static_clear_state(device);
|
||||
pvr_device_finish_compute_idfwdf_state(device);
|
||||
pvr_device_destroy_compute_query_programs(device);
|
||||
|
@@ -366,6 +366,19 @@ struct pvr_device {
|
||||
[PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT_WITH_HOLES];
|
||||
} static_clear_state;
|
||||
|
||||
struct {
|
||||
struct pvr_bo *usc_programs;
|
||||
struct pvr_bo *pds_programs;
|
||||
|
||||
struct {
|
||||
pvr_dev_addr_t pds_pixel_program_offset;
|
||||
pvr_dev_addr_t pds_uniform_program_offset;
|
||||
|
||||
uint32_t pds_texture_program_data_size;
|
||||
uint32_t pds_texture_program_temps_count;
|
||||
} load_program[PVR_SPM_LOAD_PROGRAM_COUNT];
|
||||
} spm_load_state;
|
||||
|
||||
struct {
|
||||
simple_mtx_t mtx;
|
||||
|
||||
|
@@ -23,13 +23,18 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
#include "c11/threads.h"
|
||||
#include "hwdef/rogue_hw_utils.h"
|
||||
#include "pvr_bo.h"
|
||||
#include "pvr_device_info.h"
|
||||
#include "pvr_pds.h"
|
||||
#include "pvr_private.h"
|
||||
#include "pvr_shader_factory.h"
|
||||
#include "pvr_spm.h"
|
||||
#include "pvr_static_shaders.h"
|
||||
#include "util/simple_mtx.h"
|
||||
#include "util/u_atomic.h"
|
||||
#include "vk_alloc.h"
|
||||
@@ -236,3 +241,155 @@ VkResult pvr_spm_scratch_buffer_get_buffer(
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult pvr_device_init_spm_load_state(struct pvr_device *device)
|
||||
{
|
||||
const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
|
||||
uint32_t pds_texture_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
|
||||
uint32_t pds_kick_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
|
||||
uint32_t usc_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
|
||||
uint32_t pds_allocation_size = 0;
|
||||
uint32_t usc_allocation_size = 0;
|
||||
struct pvr_bo *pds_bo;
|
||||
struct pvr_bo *usc_bo;
|
||||
uint8_t *mem_ptr;
|
||||
VkResult result;
|
||||
|
||||
static_assert(PVR_SPM_LOAD_PROGRAM_COUNT == ARRAY_SIZE(spm_load_collection),
|
||||
"Size mismatch");
|
||||
|
||||
/* TODO: We don't need to upload all the programs since the set contains
|
||||
* programs for devices with 8 output regs as well. We can save some memory
|
||||
* by not uploading them on devices without the feature.
|
||||
* It's likely that once the compiler is hooked up we'll be using the shader
|
||||
* cache and generate the shaders as needed so this todo will be unnecessary.
|
||||
*/
|
||||
|
||||
/* Upload USC shaders. */
|
||||
|
||||
for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
|
||||
usc_aligned_offsets[i] = usc_allocation_size;
|
||||
usc_allocation_size += ALIGN_POT(spm_load_collection[i].size, 4);
|
||||
}
|
||||
|
||||
result = pvr_bo_alloc(device,
|
||||
device->heaps.usc_heap,
|
||||
usc_allocation_size,
|
||||
4,
|
||||
PVR_BO_ALLOC_FLAG_CPU_MAPPED,
|
||||
&usc_bo);
|
||||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
|
||||
mem_ptr = (uint8_t *)usc_bo->bo->map;
|
||||
|
||||
for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
|
||||
memcpy(mem_ptr + usc_aligned_offsets[i],
|
||||
spm_load_collection[i].code,
|
||||
spm_load_collection[i].size);
|
||||
}
|
||||
|
||||
pvr_bo_cpu_unmap(device, usc_bo);
|
||||
|
||||
/* Upload PDS programs. */
|
||||
|
||||
for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
|
||||
struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
|
||||
/* DMA for clear colors and tile buffer address parts. */
|
||||
.num_texture_dma_kicks = 1,
|
||||
};
|
||||
struct pvr_pds_kickusc_program pds_kick_program = { 0 };
|
||||
|
||||
/* TODO: This looks a bit odd and isn't consistent with other code where
|
||||
* we're getting the size of the PDS program. Can we improve this?
|
||||
*/
|
||||
pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&pds_texture_program);
|
||||
pvr_pds_set_sizes_pixel_shader_sa_texture_data(&pds_texture_program,
|
||||
dev_info);
|
||||
|
||||
/* TODO: Looking at the pvr_pds_generate_...() functions and the run-time
|
||||
* behavior the data size is always the same here. Should we try saving
|
||||
* some memory by adjusting things based on that?
|
||||
*/
|
||||
device->spm_load_state.load_program[i].pds_texture_program_data_size =
|
||||
pds_texture_program.data_size;
|
||||
|
||||
pds_texture_aligned_offsets[i] = pds_allocation_size;
|
||||
/* FIXME: Figure out the define for alignment of 16. */
|
||||
pds_allocation_size += ALIGN_POT(pds_texture_program.code_size * 4, 16);
|
||||
|
||||
pvr_pds_set_sizes_pixel_shader(&pds_kick_program);
|
||||
|
||||
pds_kick_aligned_offsets[i] = pds_allocation_size;
|
||||
/* FIXME: Figure out the define for alignment of 16. */
|
||||
pds_allocation_size += ALIGN_POT(
|
||||
(pds_kick_program.code_size + pds_kick_program.data_size) * 4,
|
||||
16);
|
||||
}
|
||||
|
||||
/* FIXME: Figure out the define for alignment of 16. */
|
||||
result = pvr_bo_alloc(device,
|
||||
device->heaps.pds_heap,
|
||||
pds_allocation_size,
|
||||
16,
|
||||
PVR_BO_ALLOC_FLAG_CPU_MAPPED,
|
||||
&pds_bo);
|
||||
if (result != VK_SUCCESS) {
|
||||
pvr_bo_free(device, usc_bo);
|
||||
return result;
|
||||
}
|
||||
|
||||
mem_ptr = (uint8_t *)pds_bo->bo->map;
|
||||
|
||||
for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
|
||||
struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
|
||||
/* DMA for clear colors and tile buffer address parts. */
|
||||
.num_texture_dma_kicks = 1,
|
||||
};
|
||||
const pvr_dev_addr_t usc_program_dev_addr =
|
||||
PVR_DEV_ADDR_OFFSET(usc_bo->vma->dev_addr, usc_aligned_offsets[i]);
|
||||
struct pvr_pds_kickusc_program pds_kick_program = { 0 };
|
||||
|
||||
pvr_pds_generate_pixel_shader_sa_code_segment(
|
||||
&pds_texture_program,
|
||||
(uint32_t *)(mem_ptr + pds_texture_aligned_offsets[i]));
|
||||
|
||||
pvr_pds_setup_doutu(&pds_kick_program.usc_task_control,
|
||||
usc_program_dev_addr.addr,
|
||||
spm_load_collection[i].info->temps_required,
|
||||
PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
|
||||
false);
|
||||
|
||||
/* Generated both code and data. */
|
||||
pvr_pds_generate_pixel_shader_program(
|
||||
&pds_kick_program,
|
||||
(uint32_t *)(mem_ptr + pds_kick_aligned_offsets[i]));
|
||||
|
||||
device->spm_load_state.load_program[i].pds_pixel_program_offset =
|
||||
PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
|
||||
pds_kick_aligned_offsets[i]);
|
||||
device->spm_load_state.load_program[i].pds_uniform_program_offset =
|
||||
PVR_DEV_ADDR_OFFSET(pds_bo->vma->dev_addr,
|
||||
pds_texture_aligned_offsets[i]);
|
||||
|
||||
/* TODO: From looking at the pvr_pds_generate_...() functions, it seems
|
||||
* like temps_used is always 1. Should we remove this and hard code it
|
||||
* with a define in the PDS code?
|
||||
*/
|
||||
device->spm_load_state.load_program[i].pds_texture_program_temps_count =
|
||||
pds_texture_program.temps_used;
|
||||
}
|
||||
|
||||
pvr_bo_cpu_unmap(device, pds_bo);
|
||||
|
||||
device->spm_load_state.usc_programs = usc_bo;
|
||||
device->spm_load_state.pds_programs = pds_bo;
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
void pvr_device_finish_spm_load_state(struct pvr_device *device)
|
||||
{
|
||||
pvr_bo_free(device, device->spm_load_state.pds_programs);
|
||||
pvr_bo_free(device, device->spm_load_state.usc_programs);
|
||||
}
|
||||
|
@@ -75,4 +75,8 @@ VkResult pvr_spm_scratch_buffer_get_buffer(
|
||||
void pvr_spm_scratch_buffer_release(struct pvr_device *device,
|
||||
struct pvr_spm_scratch_buffer *buffer);
|
||||
|
||||
/* The SPM load programs are needed for the SPM background object load op. */
|
||||
VkResult pvr_device_init_spm_load_state(struct pvr_device *device);
|
||||
void pvr_device_finish_spm_load_state(struct pvr_device *device);
|
||||
|
||||
#endif /* PVR_SPM_H */
|
||||
|
@@ -125,4 +125,45 @@ pvr_get_clear_attachment_program_index(uint32_t dword_count,
|
||||
return idx;
|
||||
}
|
||||
|
||||
enum pvr_spm_load_const {
|
||||
SPM_LOAD_CONST_TILE_BUFFER_1_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_1_LOWER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_2_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_2_LOWER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_3_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_3_LOWER,
|
||||
/* The following are only available if the core does not have the
|
||||
* has_eight_output_registers feature. I.e. only available if the device has
|
||||
* 4 output regs.
|
||||
*/
|
||||
SPM_LOAD_CONST_TILE_BUFFER_4_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_4_LOWER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_5_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_5_LOWER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_6_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_6_LOWER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_7_UPPER,
|
||||
SPM_LOAD_CONST_TILE_BUFFER_7_LOWER,
|
||||
};
|
||||
#define PVR_SPM_LOAD_DEST_UNUSED ~0
|
||||
|
||||
#define PVR_SPM_LOAD_SAMPLES_COUNT 4U
|
||||
|
||||
/* If output_regs == 8
|
||||
* reg_load_programs = 4 # 1, 2, 4, 8
|
||||
* tile_buffer_load_programs = 3 # 1, 2, 3
|
||||
* else #output_regs == 4
|
||||
* reg_load_programs = 3 # 1, 2, 4
|
||||
* tile_buffer_load_programs = 7 # 1, 2, 3, 4, 5, 6, 7
|
||||
*
|
||||
* See PVR_SPM_LOAD_IN_BUFFERS_COUNT for where the amount of
|
||||
* tile_buffer_load_programs comes from.
|
||||
*
|
||||
* Tot = sample_count * (reg_load_programs + tile_buffer_load_programs)
|
||||
*/
|
||||
/* FIXME: This is currently hard coded for the am62. The Chromebook has 8
|
||||
* output regs so the count is different.
|
||||
*/
|
||||
#define PVR_SPM_LOAD_PROGRAM_COUNT (PVR_SPM_LOAD_SAMPLES_COUNT * (3 + 7))
|
||||
|
||||
#endif /* PVR_SHADER_FACTORY_H */
|
||||
|
Reference in New Issue
Block a user