From bd40a1e8c93b9852302c8b1a11ff8e7496f56a03 Mon Sep 17 00:00:00 2001 From: Mark Janes Date: Thu, 29 Apr 2021 18:48:03 -0700 Subject: [PATCH] intel/fs: Lower untyped atomic messages to LSC when available Bspec programming note mentions that "Atomic messages are always forced to "un-cacheable" in the L1 cache". We therefore set the L1 cache policy to un-cacheable and the L3 cache policy to write-back. v2: (Sagar Ghuge): - Fix caching policy for atomic messages - Fix simd exec size v3: (Sagar Ghuge): - Add atomic messages to brw_schedule_instructions v4: (Jason Ekstrand): - Rebase on lsc_msg_desc reworks Co-authored-by: Sagar Ghuge Co-authored-by: Jason Ekstrand Reviewed-by: Jason Ekstrand Reviewed-by: Sagar Ghuge Part-of: --- src/intel/compiler/brw_fs.cpp | 54 ++++++++++++++++++- src/intel/compiler/brw_ir_performance.cpp | 19 +++++++ .../compiler/brw_schedule_instructions.cpp | 16 ++++++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 35d48cb5c0f..4ff29a94cc5 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5842,6 +5842,42 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->resize_sources(4); } +static enum lsc_opcode +brw_atomic_op_to_lsc_atomic_op(unsigned op) +{ + switch(op) { + case BRW_AOP_AND: + return LSC_OP_ATOMIC_AND; + case BRW_AOP_OR: + return LSC_OP_ATOMIC_OR; + case BRW_AOP_XOR: + return LSC_OP_ATOMIC_XOR; + case BRW_AOP_MOV: + return LSC_OP_ATOMIC_STORE; + case BRW_AOP_INC: + return LSC_OP_ATOMIC_INC; + case BRW_AOP_DEC: + return LSC_OP_ATOMIC_DEC; + case BRW_AOP_ADD: + return LSC_OP_ATOMIC_ADD; + case BRW_AOP_SUB: + return LSC_OP_ATOMIC_SUB; + case BRW_AOP_IMAX: + return LSC_OP_ATOMIC_MAX; + case BRW_AOP_IMIN: + return LSC_OP_ATOMIC_MIN; + case BRW_AOP_UMAX: + return LSC_OP_ATOMIC_UMAX; + case BRW_AOP_UMIN: + return LSC_OP_ATOMIC_UMIN; + case BRW_AOP_CMPWR: + return LSC_OP_ATOMIC_CMPXCHG; + default: + assert(false); + unreachable("invalid 
atomic opcode"); + } +} + static void lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5915,6 +5951,22 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) LSC_CACHE_STORE_L1STATE_L3MOCS, false /* has_dest */); break; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + /* Bspec: Atomic instruction -> Cache section: + * + * Atomic messages are always forced to "un-cacheable" in the L1 + * cache. + */ + inst->desc = lsc_msg_desc(devinfo, + brw_atomic_op_to_lsc_atomic_op(arg.ud), + inst->exec_size, + surf_type, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, 1 /* num_channels */, + false /* transpose */, + LSC_CACHE_STORE_L1UC_L3WB, + !inst->dst.is_null()); + break; default: unreachable("Unknown surface logical instruction"); } @@ -6530,6 +6582,7 @@ fs_visitor::lower_logical_sends() case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: if (devinfo->has_lsc) { lower_lsc_surface_logical_send(ibld, inst); break; @@ -6538,7 +6591,6 @@ fs_visitor::lower_logical_sends() case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: - case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index f89a01e5dcd..d04514cb431 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -1103,6 +1103,25 @@ namespace { 0, 20 /* XXX */, 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case 
LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + return calculate_desc(info, unit_dp_dc, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); default: abort(); } diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 6b9d8389a88..b5cd1064c55 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -536,6 +536,22 @@ schedule_node::set_latency_gfx7(bool is_haswell) case LSC_OP_STORE_CMASK: latency = 300; break; + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + latency = 1400; + break; default: unreachable("unsupported new data port message instruction"); }