intel/fs: Add Wa_22013689345

v2: Use a simpler framework (Lionel)

v3: Rebase, add task/mesh (Lionel)

v4: Fixup fence exec size (SIMDX -> SIMD1)

v5: Fix invalidate_analysis, add finishme comment (Curro)

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: 22.0 <mesa-stable>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14947>
This commit is contained in:
Sagar Ghuge
2021-10-27 14:11:27 -07:00
committed by Marge Bot
parent 5cc4075f95
commit 6031ad4bf6
8 changed files with 94 additions and 4 deletions

View File

@@ -679,6 +679,7 @@ static const char* const lsc_flush_type[] = {
[LSC_FLUSH_TYPE_DISCARD] = "discard",
[LSC_FLUSH_TYPE_CLEAN] = "clean",
[LSC_FLUSH_TYPE_L3ONLY] = "l3only",
[LSC_FLUSH_TYPE_NONE_6] = "none_6",
};
static const char* const lsc_addr_size[] = {

View File

@@ -1781,6 +1781,7 @@ brw_memory_fence(struct brw_codegen *p,
struct brw_reg src,
enum opcode send_op,
enum brw_message_target sfid,
uint32_t desc,
bool commit_enable,
unsigned bti);

View File

@@ -1991,6 +1991,11 @@ enum PACKED lsc_flush_type {
* Flush "RW" section of the L3 cache, but leave L1 and L2 caches untouched.
*/
LSC_FLUSH_TYPE_L3ONLY = 5,
/*
* HW maps this flush type internally to NONE.
*/
LSC_FLUSH_TYPE_NONE_6 = 6,
};
enum PACKED lsc_backup_fence_routing {

View File

@@ -3256,7 +3256,8 @@ brw_set_memory_fence_message(struct brw_codegen *p,
static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
struct brw_inst *insn,
enum brw_message_target sfid)
enum brw_message_target sfid,
uint32_t desc)
{
const unsigned mlen = 1; /* g0 header */
/* Completion signaled by write to register. No data returned. */
@@ -3268,8 +3269,8 @@ gfx12_set_memory_fence_message(struct brw_codegen *p,
brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
brw_message_desc(p->devinfo, mlen, rlen, false));
} else {
enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc);
if (sfid == GFX12_SFID_TGM) {
scope = LSC_FENCE_TILE;
@@ -3288,6 +3289,7 @@ brw_memory_fence(struct brw_codegen *p,
struct brw_reg src,
enum opcode send_op,
enum brw_message_target sfid,
uint32_t desc,
bool commit_enable,
unsigned bti)
{
@@ -3307,7 +3309,7 @@ brw_memory_fence(struct brw_codegen *p,
/* All DG2 hardware requires LSC for fence messages, even A-step */
if (devinfo->has_lsc)
gfx12_set_memory_fence_message(p, insn, sfid);
gfx12_set_memory_fence_message(p, insn, sfid, desc);
else
brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}

View File

@@ -8618,6 +8618,75 @@ fs_visitor::fixup_3src_null_dest()
DEPENDENCY_VARIABLES);
}
/**
 * Tell whether \p inst is a message the dummy end-of-thread fence has to
 * cover: one sent to the UGM shared function that has no destination
 * register.
 *
 * \p devinfo is accepted but not consulted by the current criterion.
 */
static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* The goal of the workaround is that nothing written through UGM is
    * still outstanding when the thread reaches EOT.
    *
    * Its description singles out UGM writes and atomic messages, but what
    * really matters is any message that has not completed yet.  A SEND
    * with a destination register is normally read back by some later
    * instruction, so it is either synchronized by the rest of the shader
    * or optimized away.  Only the messages without a destination register
    * remain to be handled here.
    */
   return inst->sfid == GFX12_SFID_UGM && inst->dst.file == BAD_FILE;
}
/* Wa_22013689345
 *
 * A UGM fence message has to be emitted ahead of EOT whenever the shader
 * contains a UGM write or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 *                We probably need a better criterion in needs_dummy_fence().
 */
void
fs_visitor::emit_dummy_memory_fence_before_eot()
{
   /* The workaround is only applied on DG2. */
   if (!intel_device_info_is_dg2(devinfo))
      return;

   bool pending_ugm_send = false;
   bool fence_emitted = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->eot) {
         if (pending_ugm_send) {
            /* Builder anchored at the EOT instruction, with all channels
             * enabled and a single-channel (SIMD1) group.
             */
            const fs_builder ubld =
               fs_builder(this, block, inst).exec_all().group(1, 0);

            fs_reg fence_dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
            fs_inst *fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       fence_dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
            fence->sfid = GFX12_SFID_UGM;
            fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);

            /* The scheduling fence reads the memory fence's destination.
             * NOTE(review): presumably this keeps the fence ordered ahead
             * of the EOT -- confirm against the scheduler.
             */
            ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
                      fence_dst);

            fence_emitted = true;
         }

         /* TODO: remove this break if we ever have shader with multiple EOT. */
         break;
      }

      /* Remember whether anything ahead of the EOT needs the fence. */
      if (needs_dummy_fence(devinfo, inst))
         pending_ugm_send = true;
   }

   /* New instructions (and a new VGRF) were added, so drop any cached
    * analysis results that depend on them.
    */
   if (fence_emitted) {
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   }
}
/**
* Find the first instruction in the program that might start a region of
* divergent control flow due to a HALT jump. There is no
@@ -8927,6 +8996,7 @@ fs_visitor::run_vs()
assign_vs_urb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9049,6 +9119,7 @@ fs_visitor::run_tcs()
assign_tcs_urb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9077,6 +9148,7 @@ fs_visitor::run_tes()
assign_tes_urb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9120,6 +9192,7 @@ fs_visitor::run_gs()
assign_gs_urb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9220,6 +9293,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
assign_urb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
}
@@ -9255,6 +9329,7 @@ fs_visitor::run_cs(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
return !failed;
@@ -9283,6 +9358,7 @@ fs_visitor::run_bs(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
return !failed;
@@ -9327,6 +9403,7 @@ fs_visitor::run_task(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
return !failed;
@@ -9371,6 +9448,7 @@ fs_visitor::run_mesh(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
return !failed;

View File

@@ -136,6 +136,7 @@ public:
void setup_cs_payload();
bool fixup_sends_duplicate_payload();
void fixup_3src_null_dest();
void emit_dummy_memory_fence_before_eot();
bool fixup_nomask_control_flow();
void assign_curb_setup();
void assign_urb_setup();

View File

@@ -2382,6 +2382,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
brw_memory_fence(p, dst, src[0], send_op,
brw_message_target(inst->sfid),
inst->desc,
/* commit_enable */ src[1].ud,
/* bti */ src[2].ud);
send_count++;

View File

@@ -1926,6 +1926,7 @@ generate_code(struct brw_codegen *p,
case SHADER_OPCODE_MEMORY_FENCE:
brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
brw_message_target(inst->sfid),
inst->desc,
/* commit_enable */ false,
/* bti */ 0);
send_count++;