intel/fs: improve Wa_22013689345 workaround
The initial implementation is a pretty big hammer. Implement the HW
recommendation to minimize cases in which we need a fence.
This improves performance by 10 FPS on some of the Sascha Willems RT demos.
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 6031ad4bf6 ("intel/fs: Add Wa_22013689345")
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19322>
This commit is contained in:

committed by
Marge Bot

parent
86f353ed23
commit
945637514e
@@ -1203,6 +1203,43 @@ lsc_opcode_has_transpose(enum lsc_opcode opcode)
|
||||
return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
lsc_opcode_is_store(enum lsc_opcode opcode)
|
||||
{
|
||||
return opcode == LSC_OP_STORE ||
|
||||
opcode == LSC_OP_STORE_CMASK;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
lsc_opcode_is_atomic(enum lsc_opcode opcode)
|
||||
{
|
||||
switch (opcode) {
|
||||
case LSC_OP_ATOMIC_INC:
|
||||
case LSC_OP_ATOMIC_DEC:
|
||||
case LSC_OP_ATOMIC_LOAD:
|
||||
case LSC_OP_ATOMIC_STORE:
|
||||
case LSC_OP_ATOMIC_ADD:
|
||||
case LSC_OP_ATOMIC_SUB:
|
||||
case LSC_OP_ATOMIC_MIN:
|
||||
case LSC_OP_ATOMIC_MAX:
|
||||
case LSC_OP_ATOMIC_UMIN:
|
||||
case LSC_OP_ATOMIC_UMAX:
|
||||
case LSC_OP_ATOMIC_CMPXCHG:
|
||||
case LSC_OP_ATOMIC_FADD:
|
||||
case LSC_OP_ATOMIC_FSUB:
|
||||
case LSC_OP_ATOMIC_FMIN:
|
||||
case LSC_OP_ATOMIC_FMAX:
|
||||
case LSC_OP_ATOMIC_FCMPXCHG:
|
||||
case LSC_OP_ATOMIC_AND:
|
||||
case LSC_OP_ATOMIC_OR:
|
||||
case LSC_OP_ATOMIC_XOR:
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
lsc_data_size_bytes(enum lsc_data_size data_size)
|
||||
{
|
||||
|
@@ -6332,18 +6332,34 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
|
||||
{
|
||||
/* This workaround is about making sure that any instruction writing
|
||||
* through UGM has completed before we hit EOT.
|
||||
*
|
||||
* The workaround talks about UGM writes or atomic message but what is
|
||||
* important is anything that hasn't completed. Usually any SEND
|
||||
* instruction that has a destination register will be read by something
|
||||
* else so we don't need to care about those as they will be synchronized
|
||||
* by other parts of the shader or optimized away. What is left are
|
||||
* instructions that don't have a destination register.
|
||||
*/
|
||||
if (inst->sfid != GFX12_SFID_UGM)
|
||||
return false;
|
||||
|
||||
return inst->dst.file == BAD_FILE;
|
||||
/* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
|
||||
* where the L1-cache override is NOT among {WB, WS, WT}
|
||||
*/
|
||||
enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
|
||||
if (lsc_opcode_is_store(opcode)) {
|
||||
switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
|
||||
case LSC_CACHE_STORE_L1STATE_L3MOCS:
|
||||
case LSC_CACHE_STORE_L1WB_L3WB:
|
||||
case LSC_CACHE_STORE_L1S_L3UC:
|
||||
case LSC_CACHE_STORE_L1S_L3WB:
|
||||
case LSC_CACHE_STORE_L1WT_L3UC:
|
||||
case LSC_CACHE_STORE_L1WT_L3WB:
|
||||
return false;
|
||||
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Any UGM Atomic message WITHOUT return value */
|
||||
if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Wa_22013689345
|
||||
|
Reference in New Issue
Block a user