intel/fs: improve Wa_22013689345 workaround

The initial implementation is a pretty big hammer. Implement the HW
recommendation to minimize cases in which we need a fence.

This improves performance by 10 FPS on some of the Sascha Willems RT demos.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 6031ad4bf6 ("intel/fs: Add Wa_22013689345")
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19322>
This commit is contained in:
Lionel Landwerlin
2022-08-16 08:08:43 +00:00
committed by Marge Bot
parent 86f353ed23
commit 945637514e
2 changed files with 61 additions and 8 deletions

View File

@@ -1203,6 +1203,43 @@ lsc_opcode_has_transpose(enum lsc_opcode opcode)
return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE;
}
/* Tells whether an LSC opcode is a plain store message.
 *
 * Returns true only for LSC_OP_STORE and LSC_OP_STORE_CMASK; every other
 * opcode (LSC_OP_ATOMIC_STORE included) yields false.
 */
static inline bool
lsc_opcode_is_store(enum lsc_opcode opcode)
{
   switch (opcode) {
   case LSC_OP_STORE:
   case LSC_OP_STORE_CMASK:
      return true;
   default:
      return false;
   }
}
/* Tells whether an LSC opcode is one of the atomic operations enumerated
 * in the switch below.
 *
 * Returns true for each listed LSC_OP_ATOMIC_* value and false for any
 * other opcode.
 */
static inline bool
lsc_opcode_is_atomic(enum lsc_opcode opcode)
{
   bool is_atomic;

   switch (opcode) {
   /* Cases are grouped by opcode name for readability only; the order has
    * no effect on the result.
    */
   case LSC_OP_ATOMIC_LOAD:
   case LSC_OP_ATOMIC_STORE:
   case LSC_OP_ATOMIC_INC:
   case LSC_OP_ATOMIC_DEC:
   case LSC_OP_ATOMIC_ADD:
   case LSC_OP_ATOMIC_SUB:
   case LSC_OP_ATOMIC_MIN:
   case LSC_OP_ATOMIC_MAX:
   case LSC_OP_ATOMIC_UMIN:
   case LSC_OP_ATOMIC_UMAX:
   case LSC_OP_ATOMIC_CMPXCHG:
   case LSC_OP_ATOMIC_AND:
   case LSC_OP_ATOMIC_OR:
   case LSC_OP_ATOMIC_XOR:
   case LSC_OP_ATOMIC_FADD:
   case LSC_OP_ATOMIC_FSUB:
   case LSC_OP_ATOMIC_FMIN:
   case LSC_OP_ATOMIC_FMAX:
   case LSC_OP_ATOMIC_FCMPXCHG:
      is_atomic = true;
      break;
   default:
      is_atomic = false;
      break;
   }

   return is_atomic;
}
static inline uint32_t
lsc_data_size_bytes(enum lsc_data_size data_size)
{

View File

@@ -6332,18 +6332,34 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
/* This workaround is about making sure that any instruction writing
* through UGM has completed before we hit EOT.
*
* The workaround talks about UGM writes or atomic message but what is
* important is anything that hasn't completed. Usually any SEND
* instruction that has a destination register will be read by something
* else so we don't need to care about those as they will be synchronized
* by other parts of the shader or optimized away. What is left are
* instructions that don't have a destination register.
*/
if (inst->sfid != GFX12_SFID_UGM)
return false;
return inst->dst.file == BAD_FILE;
/* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
* where the L1-cache override is NOT among {WB, WS, WT}
*/
enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
if (lsc_opcode_is_store(opcode)) {
switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
case LSC_CACHE_STORE_L1STATE_L3MOCS:
case LSC_CACHE_STORE_L1WB_L3WB:
case LSC_CACHE_STORE_L1S_L3UC:
case LSC_CACHE_STORE_L1S_L3WB:
case LSC_CACHE_STORE_L1WT_L3UC:
case LSC_CACHE_STORE_L1WT_L3WB:
return false;
default:
return true;
}
}
/* Any UGM Atomic message WITHOUT return value */
if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
return true;
return false;
}
/* Wa_22013689345