tu: Support higher descriptor set count for A7XX
Allows the descriptor set count to vary at runtime depending on the
specific GPU, providing 7 usable descriptor sets on A7XX with one more
reserved for dynamic offsets.

Passing VK-CTS: dEQP-VK.binding_model.*

Signed-off-by: Mark Collins <mark@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25534>
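For readers skimming the diff, here is a minimal sketch of the derivation this change encodes. The struct and helper names (set_config, derive_set_config) are illustrative, not driver code; only the facts that each generation exposes a number of HW descriptor sets (5 on A6XX, 8 on A7XX), that the last one is reserved for dynamic offsets, and that the HLSQ invalidate mask needs one bit per HW set come from the commit itself:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical distillation: hw_max_sets mirrors info->a6xx.max_sets
 * in the driver (assumed 5 on A6XX, 8 on A7XX). The last HW set holds
 * dynamic-offset descriptors; the rest are usable by the application. */
struct set_config {
   uint32_t usable_sets;      /* app-visible sets: 4 on A6XX, 7 on A7XX */
   int32_t  reserved_set_idx; /* last HW set, holds dynamic descriptors */
   uint32_t bindless_mask;    /* one bit per HW set for HLSQ_INVALIDATE_CMD */
};

static struct set_config
derive_set_config(uint32_t hw_max_sets)
{
   struct set_config cfg = {
      .usable_sets = hw_max_sets - 1,
      .reserved_set_idx = (int32_t)hw_max_sets - 1,
      .bindless_mask = (1u << hw_max_sets) - 1,
   };
   return cfg;
}

int main(void)
{
   struct set_config a6xx = derive_set_config(5);
   struct set_config a7xx = derive_set_config(8);
   printf("A6XX: usable=%u mask=0x%x\n", a6xx.usable_sets, a6xx.bindless_mask);
   printf("A7XX: usable=%u mask=0x%x\n", a7xx.usable_sets, a7xx.bindless_mask);
   return 0;
}

With hw_max_sets = 5 this reproduces the A6XX values (4 usable sets, mask 0x1f) and with 8 the A7XX ones (7 usable sets, mask 0xff), which is why the hunks below replace the hardcoded 0x1f with CHIP == A6XX ? 0x1f : 0xff and most uses of MAX_SETS with usable_sets or reserved_set_idx.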
@@ -825,8 +825,8 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
          .cs_ibo = true,
          .gfx_ibo = true,
          .gfx_shared_const = true,
-         .cs_bindless = 0x1f,
-         .gfx_bindless = 0x1f,));
+         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
 
    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
@@ -183,8 +183,8 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CACHE_INVALIDATE);
    if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) {
       tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
-            .cs_bindless = 0x1f,
-            .gfx_bindless = 0x1f,
+            .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+            .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
       ));
    }
    if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
@@ -1146,8 +1146,8 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
          .gfx_ibo = true,
          .cs_shared_const = true,
          .gfx_shared_const = true,
-         .cs_bindless = 0x1f,
-         .gfx_bindless = 0x1f,));
+         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
 
    tu_cs_emit_wfi(cs);
 
@@ -2395,19 +2395,22 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
       tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound);
    }
 
-   /* Dynamic descriptors get the last descriptor set. */
+   /* Dynamic descriptors get the reserved descriptor set. */
    if (descriptors_state->dynamic_bound) {
-      tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2);
-      tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]);
+      int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
+      assert(reserved_set_idx >= 0); /* reserved set must be bound */
+
+      tu_cs_emit_pkt4(cs, sp_bindless_base_reg + reserved_set_idx * 2, 2);
+      tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
       if (CHIP == A6XX) {
-         tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2);
-         tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]);
+         tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + reserved_set_idx * 2, 2);
+         tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
       }
    }
 
    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
-         .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? 0x1f : 0,
-         .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? 0x1f : 0,
+         .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? CHIP == A6XX ? 0x1f : 0xff : 0,
+         .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? CHIP == A6XX ? 0x1f : 0xff : 0,
    ));
 
    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
@@ -2539,6 +2542,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
    if (layout->dynamic_offset_size) {
       /* allocate and fill out dynamic descriptor set */
       struct tu_cs_memory dynamic_desc_set;
+      int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
       VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                     layout->dynamic_offset_size / (4 * A6XX_TEX_CONST_DWORDS),
                                     A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
@@ -2549,7 +2553,8 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
 
       memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
              layout->dynamic_offset_size);
-      descriptors_state->set_iova[MAX_SETS] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
+      assert(reserved_set_idx >= 0); /* reserved set must be bound */
+      descriptors_state->set_iova[reserved_set_idx] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
       descriptors_state->dynamic_bound = true;
    }
 
@@ -52,7 +52,7 @@ struct tu_descriptor_state
    struct tu_descriptor_set *sets[MAX_SETS];
    struct tu_descriptor_set push_set;
    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
-   uint64_t set_iova[MAX_SETS + 1];
+   uint64_t set_iova[MAX_SETS];
    uint32_t max_sets_bound;
    bool dynamic_bound;
 };
@@ -491,7 +491,6 @@ tu_pipeline_layout_init(struct tu_pipeline_layout *layout)
    unsigned dynamic_offset_size = 0;
 
    for (uint32_t set = 0; set < layout->num_sets; set++) {
-      assert(set < MAX_SETS);
       layout->set[set].dynamic_offset_start = dynamic_offset_size;
 
       if (layout->set[set].layout)
@@ -548,7 +547,7 @@ tu_CreatePipelineLayout(VkDevice _device,
       TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
                      pCreateInfo->pSetLayouts[set]);
 
-      assert(set < MAX_SETS);
+      assert(set < device->physical_device->usable_sets);
       layout->set[set].layout = set_layout;
       if (set_layout)
          vk_descriptor_set_layout_ref(&set_layout->vk);
@@ -1431,7 +1430,7 @@ tu_CreateDescriptorUpdateTemplate(
       /* descriptorSetLayout should be ignored for push descriptors
        * and instead it refers to pipelineLayout and set.
        */
-      assert(pCreateInfo->set < MAX_SETS);
+      assert(pCreateInfo->set < device->physical_device->usable_sets);
       set_layout = pipeline_layout->set[pCreateInfo->set].layout;
    } else {
       TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout,
@@ -10,10 +10,11 @@
 
 #include "vk_descriptor_set_layout.h"
 
-/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic
- * descriptors and input attachments.
+/* The hardware supports up to 8 descriptor sets since A7XX.
+ * Note: This is the maximum across generations, not the maximum for a
+ * particular generation so it should only be used for allocation.
  */
-#define MAX_SETS 4
+#define MAX_SETS 8
 
 /* I have no idea what the maximum size is, but the hardware supports very
  * large numbers of descriptors (at least 2^16). This limit is based on
@@ -628,6 +628,8 @@ tu_physical_device_init(struct tu_physical_device *device,
 
       device->ccu_offset_bypass = depth_cache_size;
       device->ccu_offset_gmem = device->gmem_size - color_cache_size;
+
+      device->usable_sets = device->reserved_set_idx = device->info->a6xx.max_sets - 1;
       break;
    }
    default:
@@ -1065,7 +1067,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .maxSamplerAllocationCount = 64 * 1024,
       .bufferImageGranularity = 64, /* A cache line */
       .sparseAddressSpaceSize = 0,
-      .maxBoundDescriptorSets = MAX_SETS,
+      .maxBoundDescriptorSets = pdevice->usable_sets,
       .maxPerStageDescriptorSamplers = max_descriptor_set_size,
       .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size,
       .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size,
@@ -1327,10 +1329,10 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       properties->bufferlessPushDescriptors = true;
       properties->allowSamplerImageViewPostSubmitCreation = true;
       properties->descriptorBufferOffsetAlignment = A6XX_TEX_CONST_DWORDS * 4;
-      properties->maxDescriptorBufferBindings = MAX_SETS;
-      properties->maxResourceDescriptorBufferBindings = MAX_SETS;
-      properties->maxSamplerDescriptorBufferBindings = MAX_SETS;
-      properties->maxEmbeddedImmutableSamplerBindings = MAX_SETS;
+      properties->maxDescriptorBufferBindings = pdevice->usable_sets;
+      properties->maxResourceDescriptorBufferBindings = pdevice->usable_sets;
+      properties->maxSamplerDescriptorBufferBindings = pdevice->usable_sets;
+      properties->maxEmbeddedImmutableSamplerBindings = pdevice->usable_sets;
       properties->maxEmbeddedImmutableSamplers = max_descriptor_set_size;
       properties->bufferCaptureReplayDescriptorDataSize = 0;
       properties->imageCaptureReplayDescriptorDataSize = 0;
|
@@ -89,6 +89,11 @@ struct tu_physical_device
|
||||
uint32_t ccu_offset_gmem;
|
||||
uint32_t ccu_offset_bypass;
|
||||
|
||||
/* Amount of usable descriptor sets, this excludes any reserved set */
|
||||
uint32_t usable_sets;
|
||||
/* Index of the reserved descriptor set, may be -1 if unset */
|
||||
int32_t reserved_set_idx;
|
||||
|
||||
bool has_set_iova;
|
||||
uint64_t va_start;
|
||||
uint64_t va_size;
|
||||
|
@@ -110,7 +110,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
 }
 
 static void
-tu6_emit_load_state(struct tu_pipeline *pipeline,
+tu6_emit_load_state(struct tu_device *device,
+                    struct tu_pipeline *pipeline,
                     struct tu_pipeline_layout *layout)
 {
    unsigned size = tu6_load_state_size(pipeline, layout);
@@ -165,7 +166,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
          continue;
       switch (binding->type) {
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         base = MAX_SETS;
+         assert(device->physical_device->reserved_set_idx >= 0);
+         base = device->physical_device->reserved_set_idx;
          offset = (layout->set[i].dynamic_offset_start +
                    binding->dynamic_offset_offset) / 4;
          FALLTHROUGH;
@@ -201,7 +203,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
          break;
       }
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         base = MAX_SETS;
+         assert(device->physical_device->reserved_set_idx >= 0);
+         base = device->physical_device->reserved_set_idx;
          offset = (layout->set[i].dynamic_offset_start +
                    binding->dynamic_offset_offset) / 4;
          FALLTHROUGH;
@@ -404,19 +407,20 @@ tu6_emit_dynamic_offset(struct tu_cs *cs,
                         const struct tu_shader *shader,
                         struct tu_pipeline_builder *builder)
 {
+   const struct tu_physical_device *phys_dev = cs->device->physical_device;
    if (!xs || shader->const_state.dynamic_offset_loc == UINT32_MAX)
       return;
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS);
+   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
-              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4)));
+              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
-   for (unsigned i = 0; i < MAX_SETS; i++) {
+   for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
       unsigned dynamic_offset_start =
          builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4);
       tu_cs_emit(cs, i < builder->layout.num_sets ? dynamic_offset_start : 0);
@@ -2235,9 +2239,9 @@ tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
       builder->layout.num_sets = MAX2(builder->layout.num_sets,
                                       library->num_sets);
+      assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
       for (unsigned j = 0; j < library->num_sets; j++) {
-         if (library->layouts[i])
-            builder->layout.set[i].layout = library->layouts[i];
+         builder->layout.set[i].layout = library->layouts[i];
       }
 
       builder->layout.push_constant_size = library->push_constant_size;
@@ -3920,7 +3924,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
       /* Blob doesn't preload state on A7XX, likely preloading either
        * doesn't work or doesn't provide benefits.
        */
-      tu6_emit_load_state(*pipeline, &builder->layout);
+      tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
    }
 }
 
@@ -4370,7 +4374,7 @@ tu_compute_pipeline_create(VkDevice device,
       pipeline->local_size[i] = v->local_size[i];
 
    if (CHIP == A6XX) {
-      tu6_emit_load_state(&pipeline->base, layout);
+      tu6_emit_load_state(dev, &pipeline->base, layout);
    }
 
    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
@@ -167,7 +167,8 @@ lower_load_push_constant(struct tu_device *dev,
 }
 
 static void
-lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
+lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
+                            nir_intrinsic_instr *instr,
                             struct tu_shader *shader,
                             const struct tu_pipeline_layout *layout)
 {
@@ -203,7 +204,8 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
          base = nir_imm_int(b, (layout->set[set].dynamic_offset_start +
                                 binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
       }
-      set = MAX_SETS;
+      assert(dev->physical_device->reserved_set_idx >= 0);
+      set = dev->physical_device->reserved_set_idx;
       break;
    default:
       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
@@ -288,7 +290,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
    }
 
-   nir_def *results[MAX_SETS + 1] = { NULL };
+   nir_def *results[MAX_SETS] = { NULL };
 
    if (nir_scalar_is_const(scalar_idx)) {
       nir_def *bindless =
@@ -298,7 +300,7 @@
    }
 
    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
-   for (unsigned i = 0; i < MAX_SETS + 1; i++) {
+   for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
       /* if (base_idx == i) { ... */
       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
 
@@ -336,7 +338,7 @@
 
    nir_def *result =
       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
-   for (int i = MAX_SETS; i >= 0; i--) {
+   for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
       nir_pop_if(b, NULL);
       if (info->has_dest)
          result = nir_if_phi(b, results[i], result);
@@ -433,7 +435,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
       return true;
 
    case nir_intrinsic_vulkan_resource_index:
-      lower_vulkan_resource_index(b, instr, shader, layout);
+      lower_vulkan_resource_index(dev, b, instr, shader, layout);
       return true;
    case nir_intrinsic_vulkan_resource_reindex:
       lower_vulkan_resource_reindex(b, instr);
@@ -715,7 +717,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
 
    if (layout->independent_sets) {
       const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
-      reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4);
+      assert(dev->physical_device->reserved_set_idx >= 0);
+      reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
    } else {
       const_state->dynamic_offset_loc = UINT32_MAX;
    }
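As a closing usage note, a small, hedged example of how the raised limit becomes visible to applications through the standard Vulkan property query; nothing Turnip-specific is assumed beyond what the tu_GetPhysicalDeviceProperties2 hunk above reports, and print_max_bound_sets is an illustrative helper, not driver code:

#include <stdio.h>
#include <vulkan/vulkan.h>

/* Query and print the per-device descriptor set limit. With this
 * change Turnip reports usable_sets here, i.e. 4 on A6XX and 7 on
 * A7XX, since one HW set stays reserved for dynamic offsets. */
static void print_max_bound_sets(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceProperties props;
   vkGetPhysicalDeviceProperties(pdev, &props);
   printf("maxBoundDescriptorSets = %u\n",
          props.limits.maxBoundDescriptorSets);
}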