intel/fs: switch register allocation spilling to use LSC on Gfx12.5+

v2: Drop the hardcoded inst->mlen=1 (Rohan)

v3: Move back to LOAD/STORE messages (limited to SIMD16 for LSC)

v4: Also use 4-GRF transpose loads for fills (Curro)

v5: Reduce the number of registers needed to build per-lane offsets (Curro)
    Drop some now-useless SIMD32 code
    Unify the unspill code

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17555>
Author:    Lionel Landwerlin
Date:      2022-07-18 12:27:53 +03:00
Committed: Marge Bot
Commit:    37b3601052 (parent 3c6fa2703d)
5 changed files with 176 additions and 46 deletions

@@ -1587,6 +1587,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot);
 
 void brw_ff_sync(struct brw_codegen *p,

@@ -2746,6 +2746,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot)
 {
    const struct intel_device_info *devinfo = p->devinfo;
@@ -2781,6 +2782,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
    }
 
    if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
+       !ex_desc_scratch &&
        (devinfo->ver >= 12 ||
         ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
       ex_desc.ud |= ex_desc_imm;
@@ -2807,7 +2809,16 @@ brw_send_indirect_split_message(struct brw_codegen *p,
        */
       unsigned imm_part = ex_desc_imm | sfid | eot << 5;
 
-      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
+      if (ex_desc_scratch) {
+         /* Or the scratch surface offset together with the immediate part of
+          * the extended descriptor.
+          */
+         assert(devinfo->verx10 >= 125);
+         brw_AND(p, addr,
+                 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(INTEL_MASK(31, 10)));
+         brw_OR(p, addr, addr, brw_imm_ud(imm_part));
+      } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
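For illustration only (not part of the patch): a standalone sketch of the value the new brw_AND/brw_OR pair leaves in the address register when ex_desc_scratch is set. The r0.5 contents and the UGM SFID value below are assumed example numbers.

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Assumed example: r0.5 carries the scratch surface state offset in
       * bits 31:10; everything below bit 10 is unrelated and gets masked off.
       */
      const uint32_t r0_5 = 0x0012a400;
      const uint32_t sfid = 0xf;      /* assumed value of GFX12_SFID_UGM */
      const uint32_t eot = 0;
      const uint32_t ex_desc_imm = 0; /* ex_mlen etc. would be folded in here */

      const uint32_t imm_part = ex_desc_imm | sfid | (eot << 5);

      uint32_t addr = r0_5 & 0xfffffc00u; /* brw_AND with INTEL_MASK(31, 10) */
      addr |= imm_part;                   /* brw_OR with the immediate part */

      printf("extended descriptor in a0 = 0x%08x\n", (unsigned)addr);
      return 0;
   }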

@@ -335,13 +335,14 @@ fs_generator::generate_send(fs_inst *inst,
    uint32_t ex_desc_imm = inst->ex_desc |
       brw_message_ex_desc(devinfo, inst->ex_mlen);
 
-   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
+   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm ||
+       inst->send_ex_desc_scratch) {
       /* If we have any sort of extended descriptor, then we need SENDS.  This
        * also covers the dual-payload case because ex_mlen goes in ex_desc.
        */
       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
                                       desc, desc_imm, ex_desc, ex_desc_imm,
-                                      inst->eot);
+                                      inst->send_ex_desc_scratch, inst->eot);
       if (inst->check_tdr)
          brw_inst_set_opcode(p->isa, brw_last_inst,
                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);

@@ -348,10 +348,15 @@ private:
    void build_interference_graph(bool allow_spilling);
    void discard_interference_graph();
 
+   fs_reg build_lane_offsets(const fs_builder &bld,
+                             uint32_t spill_offset, int ip);
+   fs_reg build_single_offset(const fs_builder &bld,
+                              uint32_t spill_offset, int ip);
+
    void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
-                     fs_reg dst, uint32_t spill_offset, unsigned count);
+                     fs_reg dst, uint32_t spill_offset, unsigned count, int ip);
    void emit_spill(const fs_builder &bld, struct shader_stats *stats,
-                   fs_reg src, uint32_t spill_offset, unsigned count);
+                   fs_reg src, uint32_t spill_offset, unsigned count, int ip);
 
    void set_spill_costs();
    int choose_spill_reg();
@@ -448,6 +453,10 @@ namespace {
    unsigned
    spill_max_size(const backend_shader *s)
    {
+      /* LSC is limited to SIMD16 sends */
+      if (s->devinfo->has_lsc)
+         return 2;
+
       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
        * altogether by spilling directly from the temporary GRF
        * allocated to hold the result of the instruction (and the
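A quick sanity check on the new limit (assuming 32-byte GRFs and 32-bit spill data, as the rest of the spill code uses): one SIMD16 LSC load/store moves 16 lanes × 4 bytes = 64 bytes = 2 GRFs, hence the return value of 2 registers per message.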
@@ -661,7 +670,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling)
    first_vgrf_node = node_count;
    node_count += fs->alloc.count;
    last_vgrf_node = node_count - 1;
-   if (devinfo->ver >= 9 && allow_spilling) {
+   if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
       scratch_header_node = node_count++;
    } else {
       scratch_header_node = -1;
@@ -742,11 +751,59 @@ fs_reg_alloc::discard_interference_graph()
    have_spill_costs = false;
 }
 
+fs_reg
+fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip)
+{
+   fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_REGISTER_TYPE_UD);
+   fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+   return offset;
+}
+
+fs_reg
+fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
+{
+   /* LSC messages are limited to SIMD16 */
+   assert(bld.dispatch_width() <= 16);
+
+   const fs_builder ubld = bld.exec_all();
+   const unsigned reg_count = ubld.dispatch_width() / 8;
+
+   fs_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_REGISTER_TYPE_UD);
+   fs_inst *inst;
+
+   /* Build an offset per lane in SIMD8 */
+   inst = ubld.group(8, 0).MOV(retype(offset, BRW_REGISTER_TYPE_UW),
+                               brw_imm_uv(0x76543210));
+   _mesa_set_add(spill_insts, inst);
+   inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_REGISTER_TYPE_UW));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Build offsets in the upper 8 lanes of SIMD16 */
+   if (ubld.dispatch_width() > 8) {
+      inst = ubld.group(8, 0).ADD(
+         byte_offset(offset, REG_SIZE),
+         byte_offset(offset, 0),
+         brw_imm_ud(8));
+      _mesa_set_add(spill_insts, inst);
+   }
+
+   /* Make the offset a dword */
+   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Add the base offset */
+   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+
+   return offset;
+}
+
 void
 fs_reg_alloc::emit_unspill(const fs_builder &bld,
                            struct shader_stats *stats,
                            fs_reg dst,
-                           uint32_t spill_offset, unsigned count)
+                           uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
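As a rough illustration (not part of the patch) of what the SIMD16 sequence emitted by build_lane_offsets() computes — a dword-strided A32 byte offset per lane into the scratch surface; the base offset below is a made-up example:

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      const uint32_t spill_offset = 0x100; /* example base, dword aligned */
      uint32_t offset[16];

      /* The 0x76543210V immediate plus the ADD of 8 on the upper half leave
       * the lane index 0..15 in each channel.
       */
      for (unsigned lane = 0; lane < 16; lane++)
         offset[lane] = lane;

      for (unsigned lane = 0; lane < 16; lane++) {
         offset[lane] <<= 2;           /* SHL by 2: one dword per lane */
         offset[lane] += spill_offset; /* ADD the base spill offset */
      }

      for (unsigned lane = 0; lane < 16; lane++)
         printf("lane %2u -> byte offset 0x%03x\n", lane, (unsigned)offset[lane]);
      return 0;
   }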
@@ -757,7 +814,53 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
       ++stats->fill_count;
 
       fs_inst *unspill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* LSC is limited to SIMD16 load/store but we can load more using
+          * transpose messages.
+          */
+         const bool use_transpose = bld.dispatch_width() > 16;
+         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
+         fs_reg offset;
+         if (use_transpose) {
+            offset = build_single_offset(ubld, spill_offset, ip);
+         } else {
+            offset = build_lane_offsets(ubld, spill_offset, ip);
+         }
+         /* We leave the extended descriptor empty and flag the instruction to
+          * ask the generator to insert the extended descriptor in the address
+          * register. That way we don't need to burn an additional register
+          * for register allocation spill/fill.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0), /* desc */
+            brw_imm_ud(0), /* ex_desc */
+            offset,        /* payload */
+            fs_reg(),      /* payload2 */
+         };
+         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
+                                  srcs, ARRAY_SIZE(srcs));
+         unspill_inst->sfid = GFX12_SFID_UGM;
+         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                           unspill_inst->exec_size,
+                                           LSC_ADDR_SURFTYPE_BSS,
+                                           LSC_ADDR_SIZE_A32,
+                                           1 /* num_coordinates */,
+                                           LSC_DATA_SIZE_D32,
+                                           use_transpose ? reg_size * 8 : 1 /* num_channels */,
+                                           use_transpose,
+                                           LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                           true /* has_dest */);
+         unspill_inst->header_size = 0;
+         unspill_inst->mlen =
+            lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
+         unspill_inst->ex_mlen = 0;
+         unspill_inst->size_written =
+            lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
+         unspill_inst->send_has_side_effects = false;
+         unspill_inst->send_is_volatile = true;
+         unspill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
          fs_builder ubld = bld.exec_all().group(1, 0);
          assert(spill_offset % 16 == 0);
@@ -765,15 +868,8 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
                                  brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, unspill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
          unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
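For the transpose path above, a worked sizing example (assuming 32-byte GRFs and 32-bit data, per the v4 note in the commit message): a SIMD32 temporary spans reg_size = 32 lanes × 4 bytes / 32 = 4 GRFs, so the fill becomes a single exec-size-1, transposed load of reg_size × 8 = 32 D32 channels from the one scalar offset built by build_single_offset(), rather than two SIMD16 loads that would each need per-lane offsets.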
@@ -815,7 +911,7 @@ void
 fs_reg_alloc::emit_spill(const fs_builder &bld,
                          struct shader_stats *stats,
                          fs_reg src,
-                         uint32_t spill_offset, unsigned count)
+                         uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = src.component_size(bld.dispatch_width()) /
@@ -826,7 +922,40 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
       ++stats->spill_count;
 
       fs_inst *spill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
+         /* We leave the extended descriptor empty and flag the instruction to
+          * relocate the extended descriptor. That way the surface offset is
+          * directly put into the instruction and we don't need to use a
+          * register to hold it.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0), /* desc */
+            brw_imm_ud(0), /* ex_desc */
+            offset,        /* payload */
+            src,           /* payload2 */
+         };
+         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
+                               srcs, ARRAY_SIZE(srcs));
+         spill_inst->sfid = GFX12_SFID_UGM;
+         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
+                                         bld.dispatch_width(),
+                                         LSC_ADDR_SURFTYPE_BSS,
+                                         LSC_ADDR_SIZE_A32,
+                                         1 /* num_coordinates */,
+                                         LSC_DATA_SIZE_D32,
+                                         1 /* num_channels */,
+                                         false /* transpose */,
+                                         LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                         false /* has_dest */);
+         spill_inst->header_size = 0;
+         spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
+         spill_inst->ex_mlen = reg_size;
+         spill_inst->size_written = 0;
+         spill_inst->send_has_side_effects = true;
+         spill_inst->send_is_volatile = false;
+         spill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
          fs_builder ubld = bld.exec_all().group(1, 0);
          assert(spill_offset % 16 == 0);
@@ -834,15 +963,8 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
                                brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, spill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
          spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
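As a worked example of the new store layout (assuming 32-byte GRFs): a SIMD16 spill of 32-bit data sends the 16 dword offsets as the first payload (mlen = 2 GRFs, per lsc_msg_desc_src0_len()) and the data itself as the second payload (ex_mlen = reg_size = 2 GRFs), while send_ex_desc_scratch tells the generator to fold the scratch surface offset from r0.5 into the extended descriptor.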
@@ -1033,25 +1155,16 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
     * SIMD16 mode, because we'd stomp the FB writes.
     */
    if (!fs->spilled_any_registers) {
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* We will allocate a register on the fly */
+      } else if (devinfo->ver >= 9) {
          this->scratch_header = alloc_scratch_header();
          fs_builder ubld = fs->bld.exec_all().group(8, 0).at(
             fs->cfg->first_block(), fs->cfg->first_block()->start());
 
-         fs_inst *inst;
-         if (devinfo->verx10 >= 125) {
-            inst = ubld.MOV(this->scratch_header, brw_imm_ud(0));
-            _mesa_set_add(spill_insts, inst);
-            inst = ubld.group(1, 0).AND(component(this->scratch_header, 0),
-                                        retype(brw_vec1_grf(0, 5),
-                                               BRW_REGISTER_TYPE_UD),
-                                        brw_imm_ud(INTEL_MASK(31, 10)));
-            _mesa_set_add(spill_insts, inst);
-         } else {
-            inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
-                             this->scratch_header);
-            _mesa_set_add(spill_insts, inst);
-         }
+         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
+                                   this->scratch_header);
+         _mesa_set_add(spill_insts, inst);
       } else {
          bool mrf_used[BRW_MAX_MRF(devinfo->ver)];
          get_used_mrfs(fs, mrf_used);
@@ -1112,7 +1225,7 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
              * unspill destination is a block-local temporary.
              */
             emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
-                         unspill_dst, subset_spill_offset, count);
+                         unspill_dst, subset_spill_offset, count, ip);
          }
       }
 
@@ -1167,10 +1280,10 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
          if (inst->is_partial_write() ||
              (!inst->force_writemask_all && !per_channel))
             emit_unspill(ubld, &fs->shader_stats, spill_src,
-                         subset_spill_offset, regs_written(inst));
+                         subset_spill_offset, regs_written(inst), ip);
 
          emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
-                    subset_spill_offset, regs_written(inst));
+                    subset_spill_offset, regs_written(inst), ip);
       }
 
       for (fs_inst *inst = (fs_inst *)before->next;

@@ -174,6 +174,10 @@ struct backend_instruction {
    bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
    bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
    bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
+   bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
+                                 *   the scratch surface offset to build
+                                 *   extended descriptor
+                                 */
    bool eot:1;
 
    /* Chooses which flag subregister (f0.0 to f1.1) is used for conditional