intel/fs: enable extended bindless surface offset

Gives us 4GB of bindless surface state on Gfx12.5+ instead of 64MB.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
This commit is contained in:
Lionel Landwerlin
2022-10-14 17:49:00 +03:00
committed by Marge Bot
parent 01fc9a06bd
commit 6d6877bf99
8 changed files with 54 additions and 7 deletions

View File

@@ -114,6 +114,13 @@ struct brw_compiler {
*/
bool indirect_ubos_use_sampler;
/**
* Gfx12.5+ has a bit in the SEND instruction extending the bindless
* surface offset range from 20 to 26 bits, effectively giving us 4GB of
* bindless surface descriptors instead of 64MB previously.
*/
bool extended_bindless_surface_offset;
struct nir_shader *clc_shader;
};

View File

@@ -1880,18 +1880,18 @@ lsc_disassemble_ex_desc(const struct intel_device_info *devinfo,
const unsigned addr_type = lsc_msg_desc_addr_type(devinfo, imm_desc);
switch (addr_type) {
case LSC_ADDR_SURFTYPE_FLAT:
format(file, "base_offset %u ",
format(file, " base_offset %u ",
lsc_flat_ex_desc_base_offset(devinfo, imm_ex_desc));
break;
case LSC_ADDR_SURFTYPE_BSS:
case LSC_ADDR_SURFTYPE_SS:
format(file, "surface_state_index %u ",
format(file, " surface_state_index %u ",
lsc_bss_ex_desc_index(devinfo, imm_ex_desc));
break;
case LSC_ADDR_SURFTYPE_BTI:
format(file, "BTI %u ",
format(file, " BTI %u ",
lsc_bti_ex_desc_index(devinfo, imm_ex_desc));
format(file, "base_offset %u ",
format(file, " base_offset %u ",
lsc_bti_ex_desc_base_offset(devinfo, imm_ex_desc));
break;
default:
@@ -2469,11 +2469,13 @@ brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
if (space)
string(file, " ");
}
if (devinfo->verx10 >= 125 && brw_inst_send_ex_bso(devinfo, inst))
format(file, " ex_bso");
if (brw_sfid_is_lsc(sfid)) {
lsc_disassemble_ex_desc(devinfo, imm_desc, imm_ex_desc, file);
} else {
if (has_imm_desc)
format(file, "mlen %u", brw_message_desc_mlen(devinfo, imm_desc));
format(file, " mlen %u", brw_message_desc_mlen(devinfo, imm_desc));
if (has_imm_ex_desc) {
format(file, " ex_mlen %u",
brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc));

View File

@@ -1721,6 +1721,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
struct brw_reg ex_desc,
unsigned ex_desc_imm,
bool ex_desc_scratch,
bool ex_bso,
bool eot);
void brw_ff_sync(struct brw_codegen *p,

View File

@@ -2739,6 +2739,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
struct brw_reg ex_desc,
unsigned ex_desc_imm,
bool ex_desc_scratch,
bool ex_bso,
bool eot)
{
const struct intel_device_info *devinfo = p->devinfo;
@@ -2777,6 +2778,12 @@ brw_send_indirect_split_message(struct brw_codegen *p,
!ex_desc_scratch &&
(devinfo->ver >= 12 ||
((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
/* ATS-M PRMs, Volume 2d: Command Reference: Structures,
* EU_INSTRUCTION_SEND instruction
*
* "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
*/
assert(!ex_bso);
ex_desc.ud |= ex_desc_imm;
} else {
const struct tgl_swsb swsb = brw_get_default_swsb(p);
@@ -2799,7 +2806,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
* descriptor which comes from the address register. If we don't OR
* those two bits in, the external unit may get confused and hang.
*/
unsigned imm_part = ex_desc_imm | sfid | eot << 5;
unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);
if (ex_desc_scratch) {
/* Or the scratch surface offset together with the immediate part of
@@ -2852,6 +2859,10 @@ brw_send_indirect_split_message(struct brw_codegen *p,
brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
}
if (ex_bso) {
brw_inst_set_send_ex_bso(devinfo, send, true);
brw_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6));
}
brw_inst_set_sfid(devinfo, send, sfid);
brw_inst_set_eot(devinfo, send, eot);
}

View File

@@ -343,7 +343,8 @@ fs_generator::generate_send(fs_inst *inst,
*/
brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
desc, desc_imm, ex_desc, ex_desc_imm,
inst->send_ex_desc_scratch, inst->eot);
inst->send_ex_desc_scratch,
inst->send_ex_bso, inst->eot);
if (inst->check_tdr)
brw_inst_set_opcode(p->isa, brw_last_inst,
devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);

View File

@@ -620,8 +620,10 @@ FC(send_sel_reg32_desc, /* 4+ */ 77, 77, /* 12+ */ 48, 48, devinfo->ver
FC(send_sel_reg32_ex_desc, /* 4+ */ 61, 61, /* 12+ */ 49, 49, devinfo->ver >= 9)
F8(send_src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 66, 66)
FC(send_src1_reg_nr, /* 4+ */ 51, 44, /* 12+ */ 111, 104, devinfo->ver >= 9)
FC(send_src1_len, /* 4+ */ -1, -1, /* 12+ */ 103, 99, devinfo->verx10 >= 125)
FC(send_src1_reg_file, /* 4+ */ 36, 36, /* 12+ */ 98, 98, devinfo->ver >= 9)
FC(send_dst_reg_file, /* 4+ */ 35, 35, /* 12+ */ 50, 50, devinfo->ver >= 9)
FC(send_ex_bso, /* 4+ */ -1, -1, /* 12+ */ 39, 39, devinfo->verx10 >= 125)
/** @} */
/* Message descriptor bits */

View File

@@ -177,6 +177,9 @@ struct backend_instruction {
* the scratch surface offset to build
* extended descriptor
*/
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless
* surface offset (26bits instead of 20bits)
*/
bool predicate_trivial:1; /**< The predication mask applied to this
* instruction is guaranteed to be uniform and
* a superset of the execution mask of the

View File

@@ -808,6 +808,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
unsigned coord_components,
unsigned grad_components)
{
const brw_compiler *compiler = bld.shader->compiler;
const intel_device_info *devinfo = bld.shader->devinfo;
const enum brw_reg_type payload_type =
brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
@@ -1153,6 +1154,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
* we can use the surface handle directly as the extended descriptor.
*/
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
inst->send_ex_bso = compiler->extended_bindless_surface_offset;
} else {
/* Immediate portion of the descriptor */
inst->desc = brw_sampler_desc(devinfo,
@@ -1344,6 +1346,7 @@ setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
const fs_reg &surface, const fs_reg &surface_handle)
{
const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
const brw_compiler *compiler = bld.shader->compiler;
/* We must have exactly one of surface and surface_handle */
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
@@ -1362,6 +1365,7 @@ setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
* we can use the surface handle directly as the extended descriptor.
*/
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
inst->send_ex_bso = compiler->extended_bindless_surface_offset;
} else {
inst->desc = desc;
const fs_builder ubld = bld.exec_all().group(1, 0);
@@ -1377,12 +1381,15 @@ setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
uint32_t desc, const fs_reg &surface)
{
const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
const brw_compiler *compiler = bld.shader->compiler;
inst->src[0] = brw_imm_ud(0); /* desc */
enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
switch (surf_type) {
case LSC_ADDR_SURFTYPE_BSS:
inst->send_ex_bso = compiler->extended_bindless_surface_offset;
/* fall-through */
case LSC_ADDR_SURFTYPE_SS:
assert(surface.file != BAD_FILE);
/* We assume that the driver provided the handle in the top 20 bits so
@@ -1415,6 +1422,7 @@ setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
const brw_compiler *compiler = bld.shader->compiler;
const intel_device_info *devinfo = bld.shader->devinfo;
/* Get the logical send arguments. */
@@ -1646,6 +1654,8 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->header_size = header_sz;
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
inst->send_ex_bso = surface_handle.file != BAD_FILE &&
compiler->extended_bindless_surface_offset;
/* Set up SFID and descriptors */
inst->sfid = sfid;
@@ -1674,6 +1684,7 @@ lsc_bits_to_data_size(unsigned bit_size)
static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
const brw_compiler *compiler = bld.shader->compiler;
const intel_device_info *devinfo = bld.shader->devinfo;
assert(devinfo->has_lsc);
@@ -1808,6 +1819,8 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->header_size = 0;
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
compiler->extended_bindless_surface_offset;
inst->resize_sources(4);
@@ -1828,6 +1841,7 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
static void
lower_lsc_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
const brw_compiler *compiler = bld.shader->compiler;
const intel_device_info *devinfo = bld.shader->devinfo;
assert(devinfo->has_lsc);
@@ -1893,6 +1907,8 @@ lower_lsc_block_logical_send(const fs_builder &bld, fs_inst *inst)
inst->header_size = 0;
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
compiler->extended_bindless_surface_offset;
inst->resize_sources(4);
@@ -2322,6 +2338,8 @@ lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
inst->opcode = SHADER_OPCODE_SEND;
inst->sfid = GFX12_SFID_UGM;
inst->resize_sources(3);
inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
compiler->extended_bindless_surface_offset;
assert(!compiler->indirect_ubos_use_sampler);
@@ -3019,6 +3037,8 @@ fs_visitor::lower_uniform_pull_constant_loads()
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
inst->send_ex_bso = surface_handle.file != BAD_FILE &&
compiler->extended_bindless_surface_offset;
inst->ex_mlen = 0;
inst->header_size = 0;
inst->send_has_side_effects = false;