tu: support KHR_8bit_storage

Add basic KHR_8bit_storage support for Adreno 750 devices, for now enabling
the storageBuffer8BitAccess feature. A separate descriptor is provided for
8-bit storage access. The descriptor index is adjusted appropriately for
8-bit SSBO loads and stores.

The 8-bit SSBO loads cannot go through isam since that instruction cannot
handle 8-bit accesses. The ldib and stib instruction encodings are a bit
peculiar, but they match the blob's image buffer access through VK_FORMAT_R8
and the dedicated descriptor. These loads and stores do not work in
vectorized form, so they have to be scalarized. Additionally, stores of
8-bit values have to zero out the higher bits of those values.

8-bit truncation can leave higher bits as undefined. Zero-extension of
8-bit values has to use masking since the corresponding cov instruction
doesn't function as intended. 8-bit sign extension through cov from a
non-shared to a shared register also doesn't work, so an exception is
applied to avoid it.

Conversion of 8-bit values to and from floating-point values also doesn't
work with a straightforward cov instruction, instead the conversion has
to go through a 16-bit value.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9979
Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28254>
This commit is contained in:
Zan Dobersek
2024-03-18 21:12:44 +01:00
committed by Marge Bot
parent c93a629f2c
commit 8a84e77b15
15 changed files with 169 additions and 24 deletions

View File

@@ -448,7 +448,7 @@ Vulkan 1.1 -- all DONE: anv, lvp, nvk, radv, tu, vn
Vulkan 1.2 -- all DONE: anv, nvk, tu, vn
VK_KHR_8bit_storage DONE (anv, dzn, hasvk, lvp, nvk, radv, v3dv, vn)
VK_KHR_8bit_storage DONE (anv, dzn, hasvk, lvp, nvk, radv, tu/a750+, v3dv, vn)
VK_KHR_buffer_device_address DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
VK_KHR_create_renderpass2 DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
VK_KHR_depth_stencil_resolve DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)

View File

@@ -271,6 +271,8 @@ struct fd_dev_info {
* best thing we could do is a toggle.
*/
bool enable_tp_ubwc_flag_hint;
bool storage_8bit;
} a7xx;
};

View File

@@ -856,6 +856,7 @@ a7xx_750 = A7XXProps(
# now.
#supports_ibo_ubwc = True,
no_gs_hw_binning_quirk = True,
storage_8bit = True,
)
a730_magic_regs = dict(

View File

@@ -1175,12 +1175,14 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
/* floating-point conversions when moving from non-shared to shared
* seem not to work. We only use floating-point types in ir3 for
* conversions, so don't bother specially handling the case where the
* types are equal.
* types are equal. Same goes for 8-bit sign extension.
*/
if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
(full_type(instr->cat1.src_type) == TYPE_F32 ||
full_type(instr->cat1.dst_type) == TYPE_F32))
((full_type(instr->cat1.src_type) == TYPE_F32 ||
full_type(instr->cat1.dst_type) == TYPE_F32) ||
(instr->cat1.src_type == TYPE_U8 &&
full_type(instr->cat1.dst_type) == TYPE_S32)))
return false;
/* Conversions seem not to work in shared->shared copies before scalar

View File

@@ -69,7 +69,24 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
ldib->dsts[0]->wrmask = MASK(intr->num_components);
ldib->cat6.iim_val = intr->num_components;
ldib->cat6.d = 1;
ldib->cat6.type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
switch (intr->def.bit_size) {
case 8:
/* This encodes the 8-bit SSBO load and matches blob's encoding of
* imageBuffer access using VK_FORMAT_R8 and the dedicated 8-bit
* descriptor. No vectorization is possible.
*/
assert(intr->num_components == 1);
ldib->cat6.type = TYPE_U16;
ldib->cat6.typed = true;
break;
case 16:
ldib->cat6.type = TYPE_U16;
break;
default:
ldib->cat6.type = TYPE_U32;
break;
}
ldib->barrier_class = IR3_BARRIER_BUFFER_R;
ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
@@ -100,6 +117,17 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
*/
val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);
/* Any 8-bit store will be done on a single-component value that additionally
* has to be masked to clear up the higher bits or it will malfunction.
*/
if (intr->src[0].ssa->bit_size == 8) {
assert(ncomp == 1);
struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
val = ir3_AND_B(b, val, 0, mask, 0);
val->dsts[0]->flags |= IR3_REG_HALF;
}
lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val);
struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);
@@ -107,7 +135,24 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
imm_offset, 0, val, 0);
stib->cat6.iim_val = ncomp;
stib->cat6.d = 1;
stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
switch (intr->src[0].ssa->bit_size) {
case 8:
/* As with ldib, this encodes the 8-bit SSBO store and matches blob's
* encoding of imageBuffer access using VK_FORMAT_R8 and the extra 8-bit
* descriptor. No vectorization is possible and we have to override the
* relevant field anyway.
*/
stib->cat6.type = TYPE_U16;
stib->cat6.iim_val = 4;
stib->cat6.typed = true;
break;
case 16:
stib->cat6.type = TYPE_U16;
break;
default:
stib->cat6.type = TYPE_U32;
break;
}
stib->barrier_class = IR3_BARRIER_BUFFER_W;
stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

View File

@@ -67,6 +67,8 @@ struct ir3_compiler_options {
/* True if 16-bit descriptors are available. */
bool storage_16bit;
/* True if 8-bit descriptors are available. */
bool storage_8bit;
/* If base_vertex should be lowered in nir */
bool lower_base_vertex;

View File

@@ -273,6 +273,50 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
if (src_type == dst_type)
return src;
/* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
* is used to achieve the result.
*/
if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= type_flags(dst_type);
return cov;
}
/* Conversion of 8-bit values into floating-point values doesn't work with
* a simple `cov`, instead the 8-bit values first have to be converted into
* corresponding 16-bit values and converted from there.
*/
if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
op == nir_op_u2f32 || op == nir_op_i2f32);
struct ir3_instruction *cov;
if (op == nir_op_u2f16 || op == nir_op_u2f32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= IR3_REG_HALF;
cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
} else {
cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
}
return cov;
}
/* Conversion of floating-point values to 8-bit values also doesn't work
* through a single `cov`, instead the conversion has to go through the
* corresponding 16-bit type that's then truncated.
*/
if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
assert(op == nir_op_f2u8 || op == nir_op_f2i8);
type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
return cov;
}
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
if (op == nir_op_f2f16_rtne) {
@@ -1611,9 +1655,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
{
/* Note: we can only use isam for vectorized loads/stores if isam.v is
* available.
* Note: isam also can't handle 8-bit loads.
*/
if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
(intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
(ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
!ctx->compiler->has_isam_ssbo) {
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
return;

View File

@@ -416,7 +416,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
return false;
if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc))
return false;
} else if (src->cat1.dst_type == TYPE_U16) {
} else if (src->cat1.dst_type == TYPE_U16 || src->cat1.dst_type == TYPE_S16) {
/* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of
* what was a U16 value read from the constbuf would incorrectly
* do 32f->16f conversion, when we want to read a 16f value.

View File

@@ -100,6 +100,13 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
return true;
}
if ((intrin->intrinsic == nir_intrinsic_load_ssbo &&
intrin->def.bit_size == 8) ||
(intrin->intrinsic == nir_intrinsic_store_ssbo &&
intrin->src[0].ssa->bit_size == 8)) {
return true;
}
return false;
}

View File

@@ -187,6 +187,11 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
(!has_dest && intrinsic->src[0].ssa->bit_size == 16))
shift = 1;
/* for 8-bit ssbo access, offset is in 8-bit words instead of dwords */
if ((has_dest && intrinsic->def.bit_size == 8) ||
(!has_dest && intrinsic->src[0].ssa->bit_size == 8))
shift = 0;
/* Here we create a new intrinsic and copy over all contents from the old
* one. */
@@ -326,7 +331,8 @@ ir3_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data)
switch (intrin->intrinsic) {
case nir_intrinsic_load_ssbo_ir3:
if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER))
if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
!(compiler->options.storage_8bit && intrin->def.bit_size == 8))
return 255; /* isam.v */
return 127; /* ldib.b */
case nir_intrinsic_store_ssbo_ir3:

View File

@@ -761,10 +761,14 @@ can_demote_src(struct ir3_instruction *instr)
case OPC_META_COLLECT:
return false;
case OPC_MOV:
/* non-shared -> shared floating-point conversions don't work */
/* non-shared -> shared floating-point conversions and
* 8-bit sign extension don't work.
*/
return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
(full_type(instr->cat1.src_type) != TYPE_F32 &&
full_type(instr->cat1.dst_type) != TYPE_F32));
!((full_type(instr->cat1.src_type) == TYPE_F32 ||
full_type(instr->cat1.dst_type) == TYPE_F32) ||
(instr->cat1.src_type == TYPE_U8 &&
full_type(instr->cat1.dst_type) == TYPE_S32)));
default:
return (!is_alu(instr) && !is_sfu(instr)) ||
!(instr->dsts[0]->flags & IR3_REG_SHARED);

View File

@@ -2795,6 +2795,7 @@ tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd,
case FMT6_32_UINT:
offset_shift = 2;
break;
case FMT6_8_UINT:
default:
offset_shift = 0;
break;

View File

@@ -69,7 +69,8 @@ descriptor_size(struct tu_device *dev,
*/
return A6XX_TEX_CONST_DWORDS * 4 * (1 +
COND(dev->physical_device->info->a6xx.storage_16bit &&
!dev->physical_device->info->a6xx.has_isam_v, 1));
!dev->physical_device->info->a6xx.has_isam_v, 1) +
COND(dev->physical_device->info->a7xx.storage_8bit, 1));
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
return binding->descriptorCount;
default:
@@ -1016,9 +1017,17 @@ write_buffer_descriptor_addr(const struct tu_device *device,
* 16-bit and 32-bit access through isam.v will of course only be functional
* when 16-bit storage is supported. */
assert(!info->a6xx.has_isam_v || info->a6xx.storage_16bit);
/* Any configuration enabling 8-bit storage support will also provide 16-bit
* storage support and 16-bit descriptors capable of 32-bit isam loads. This
* indirectly ensures we won't need more than two descriptors for access of
* any size.
*/
assert(!info->a7xx.storage_8bit || (info->a6xx.storage_16bit &&
info->a6xx.has_isam_v));
unsigned num_descriptors = 1 + COND(info->a6xx.storage_16bit &&
!info->a6xx.has_isam_v, 1);
unsigned num_descriptors = 1 +
COND(info->a6xx.storage_16bit && !info->a6xx.has_isam_v, 1) +
COND(info->a7xx.storage_8bit, 1);
memset(dst, 0, num_descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));
if (!buffer_info || buffer_info->address == 0)
@@ -1053,6 +1062,18 @@ write_buffer_descriptor_addr(const struct tu_device *device,
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
dst += A6XX_TEX_CONST_DWORDS;
}
if (info->a7xx.storage_8bit) {
dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
dst[1] = range;
dst[2] =
A6XX_TEX_CONST_2_STRUCTSIZETEXELS(1) |
A6XX_TEX_CONST_2_STARTOFFSETTEXELS(offset) |
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
}
}

View File

@@ -142,6 +142,7 @@ get_device_extensions(const struct tu_physical_device *device,
struct vk_device_extension_table *ext)
{
*ext = (struct vk_device_extension_table) { .table = {
.KHR_8bit_storage = device->info->a7xx.storage_8bit,
.KHR_16bit_storage = device->info->a6xx.storage_16bit,
.KHR_bind_memory2 = true,
.KHR_buffer_device_address = true,
@@ -379,7 +380,7 @@ tu_get_features(struct tu_physical_device *pdevice,
/* Vulkan 1.2 */
features->samplerMirrorClampToEdge = true;
features->drawIndirectCount = true;
features->storageBuffer8BitAccess = false;
features->storageBuffer8BitAccess = pdevice->info->a7xx.storage_8bit;
features->uniformAndStorageBuffer8BitAccess = false;
features->storagePushConstant8 = false;
features->shaderBufferInt64Atomics = false;
@@ -1096,7 +1097,8 @@ tu_get_properties(struct tu_physical_device *pdevice,
props->uniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
props->robustUniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
props->storageBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4 * (1 +
COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1));
COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1) +
COND(pdevice->info->a7xx.storage_8bit, 1));
props->robustStorageBufferDescriptorSize =
props->storageBufferDescriptorSize;
props->inputAttachmentDescriptorSize = TU_DEBUG(DYNAMIC) ?
@@ -2301,6 +2303,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
.bindless_fb_read_descriptor = -1,
.bindless_fb_read_slot = -1,
.storage_16bit = physical_device->info->a6xx.storage_16bit,
.storage_8bit = physical_device->info->a7xx.storage_8bit,
.shared_push_consts = !TU_DEBUG(PUSH_CONSTS_PER_STAGE),
};
device->compiler = ir3_compiler_create(

View File

@@ -282,15 +282,20 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
}
}
/* For isam, we need to adjust the descriptor index to use the 32-bit
* descriptor if 16-bit storage support is present but the 16-bit descriptor
* cannot be used for 32-bit access through isam.v.
/* Descriptor index has to be adjusted in the following cases:
* - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
* loads -- next-index descriptor will be able to do that;
* - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
* storage accesses of that size.
*/
if (dev->physical_device->info->a6xx.storage_16bit &&
!dev->physical_device->info->a6xx.has_isam_v &&
intrin->intrinsic == nir_intrinsic_load_ssbo &&
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
intrin->def.bit_size > 16) {
if ((dev->physical_device->info->a6xx.storage_16bit &&
!dev->physical_device->info->a6xx.has_isam_v &&
intrin->intrinsic == nir_intrinsic_load_ssbo &&
(nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
intrin->def.bit_size > 16) ||
(dev->physical_device->info->a7xx.storage_8bit &&
((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
(intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
}