diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 7c501da84db..9ffb0cb9a58 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -279,47 +279,26 @@ enum opcode { /** * Texture sampling opcodes. * - * LOGICAL opcodes are eventually translated to the matching non-LOGICAL - * opcode but instead of taking a single payload blob they expect their - * arguments separately as individual sources. The position/ordering of the - * arguments are defined by the enum tex_logical_srcs. + * LOGICAL opcodes are eventually translated to SHADER_OPCODE_SEND but + * take parameters as individual sources. See enum tex_logical_srcs. */ - SHADER_OPCODE_TEX, SHADER_OPCODE_TEX_LOGICAL, - SHADER_OPCODE_TXD, SHADER_OPCODE_TXD_LOGICAL, - SHADER_OPCODE_TXF, SHADER_OPCODE_TXF_LOGICAL, - SHADER_OPCODE_TXF_LZ, - SHADER_OPCODE_TXL, SHADER_OPCODE_TXL_LOGICAL, - SHADER_OPCODE_TXL_LZ, - SHADER_OPCODE_TXS, SHADER_OPCODE_TXS_LOGICAL, - FS_OPCODE_TXB, FS_OPCODE_TXB_LOGICAL, - SHADER_OPCODE_TXF_CMS_W, SHADER_OPCODE_TXF_CMS_W_LOGICAL, SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL, - SHADER_OPCODE_TXF_MCS, SHADER_OPCODE_TXF_MCS_LOGICAL, - SHADER_OPCODE_LOD, SHADER_OPCODE_LOD_LOGICAL, - SHADER_OPCODE_TG4, SHADER_OPCODE_TG4_LOGICAL, - SHADER_OPCODE_TG4_IMPLICIT_LOD, SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL, - SHADER_OPCODE_TG4_EXPLICIT_LOD, SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL, - SHADER_OPCODE_TG4_BIAS, SHADER_OPCODE_TG4_BIAS_LOGICAL, - SHADER_OPCODE_TG4_OFFSET, SHADER_OPCODE_TG4_OFFSET_LOGICAL, - SHADER_OPCODE_TG4_OFFSET_LOD, SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL, - SHADER_OPCODE_TG4_OFFSET_BIAS, SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL, - SHADER_OPCODE_SAMPLEINFO, SHADER_OPCODE_SAMPLEINFO_LOGICAL, SHADER_OPCODE_IMAGE_SIZE_LOGICAL, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d40802c8376..fe59c246954 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -237,25 +237,6 @@ fs_inst::is_control_source(unsigned arg) const case SHADER_OPCODE_MOV_INDIRECT: case SHADER_OPCODE_CLUSTER_BROADCAST: - case SHADER_OPCODE_TEX: - case FS_OPCODE_TXB: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_LOD: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_SAMPLEINFO: return arg == 1 || arg == 2; case SHADER_OPCODE_SEND: @@ -277,25 +258,6 @@ fs_inst::is_payload(unsigned arg) const case SHADER_OPCODE_INTERLOCK: case SHADER_OPCODE_MEMORY_FENCE: case SHADER_OPCODE_BARRIER: - case SHADER_OPCODE_TEX: - case FS_OPCODE_TXB: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_LOD: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_SAMPLEINFO: return arg == 0; case SHADER_OPCODE_SEND: @@ -959,29 +921,6 @@ fs_inst::size_read(int arg) const } break; - case SHADER_OPCODE_TEX: - case FS_OPCODE_TXB: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_LOD: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_SAMPLEINFO: - if (arg == 0 && src[0].file == VGRF) - return mlen * REG_SIZE; - break; - default: break; } @@ -2343,78 +2282,40 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op) case SHADER_OPCODE_UNDEF: return "undef"; - case SHADER_OPCODE_TEX: - return "tex"; case SHADER_OPCODE_TEX_LOGICAL: return "tex_logical"; - case SHADER_OPCODE_TXD: - return "txd"; case SHADER_OPCODE_TXD_LOGICAL: return "txd_logical"; - case SHADER_OPCODE_TXF: - return "txf"; case SHADER_OPCODE_TXF_LOGICAL: return "txf_logical"; - case SHADER_OPCODE_TXF_LZ: - return "txf_lz"; - case SHADER_OPCODE_TXL: - return "txl"; case SHADER_OPCODE_TXL_LOGICAL: return "txl_logical"; - case SHADER_OPCODE_TXL_LZ: - return "txl_lz"; - case SHADER_OPCODE_TXS: - return "txs"; case SHADER_OPCODE_TXS_LOGICAL: return "txs_logical"; - case FS_OPCODE_TXB: - return "txb"; case FS_OPCODE_TXB_LOGICAL: return "txb_logical"; - case SHADER_OPCODE_TXF_CMS_W: - return "txf_cms_w"; case SHADER_OPCODE_TXF_CMS_W_LOGICAL: return "txf_cms_w_logical"; case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: return "txf_cms_w_gfx12_logical"; - case SHADER_OPCODE_TXF_MCS: - return "txf_mcs"; case SHADER_OPCODE_TXF_MCS_LOGICAL: return "txf_mcs_logical"; - case SHADER_OPCODE_LOD: - return "lod"; case SHADER_OPCODE_LOD_LOGICAL: return "lod_logical"; - case SHADER_OPCODE_TG4: - return "tg4"; case SHADER_OPCODE_TG4_LOGICAL: return "tg4_logical"; - case SHADER_OPCODE_TG4_OFFSET: - return "tg4_offset"; case SHADER_OPCODE_TG4_OFFSET_LOGICAL: return "tg4_offset_logical"; - case SHADER_OPCODE_TG4_OFFSET_LOD: - return "tg4_offset_lod"; case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: return "tg4_offset_lod_logical"; - case SHADER_OPCODE_TG4_OFFSET_BIAS: - return "tg4_offset_bias"; case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: return "tg4_offset_bias_logical"; - case SHADER_OPCODE_TG4_BIAS: - return "tg4_b"; case SHADER_OPCODE_TG4_BIAS_LOGICAL: return "tg4_b_logical"; - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - return "tg4_l"; case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: return "tg4_l_logical"; - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - return "tg4_i"; case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: return "tg4_i_logical"; - case SHADER_OPCODE_SAMPLEINFO: - return "sampleinfo"; case SHADER_OPCODE_SAMPLEINFO_LOGICAL: return "sampleinfo_logical"; diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp index a12e447a941..56ec355ba3b 100644 --- a/src/intel/compiler/brw_fs_lower_simd_width.cpp +++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp @@ -173,15 +173,15 @@ get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, * message, it will push it over 5 arguments and we have to fall back to * SIMD8. */ - if (inst->opcode != SHADER_OPCODE_TEX && + if (inst->opcode != SHADER_OPCODE_TEX_LOGICAL && inst->components_read(TEX_LOGICAL_SRC_MIN_LOD)) return devinfo->ver < 20 ? 8 : 16; /* On Gfx9+ the LOD argument is for free if we're able to use the LZ * variant of the TXL or TXF message. */ - const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL || - inst->opcode == SHADER_OPCODE_TXF) && + const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL_LOGICAL || + inst->opcode == SHADER_OPCODE_TXF_LOGICAL) && inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); /* Calculate the total number of argument components that need to be passed diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index 5ef2130a333..daf5985d2a5 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -550,26 +550,7 @@ namespace { 0, 2 /* XXX */, 0, 0, 0, 8 /* XXX */, 0, 0); - case SHADER_OPCODE_TEX: - case FS_OPCODE_TXB: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_LOD: case SHADER_OPCODE_GET_BUFFER_SIZE: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_SAMPLEINFO: return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */, 8 /* XXX */, 750 /* XXX */, 0, 0, 2 /* XXX */, 0); diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 79480d6cd0c..73b724d7853 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -571,10 +571,11 @@ is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler) static unsigned sampler_msg_type(const intel_device_info *devinfo, - opcode opcode, bool shadow_compare, bool has_min_lod) + opcode opcode, bool shadow_compare, + bool lod_is_zero, bool has_min_lod) { switch (opcode) { - case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TEX_LOGICAL: if (devinfo->ver >= 20 && has_min_lod) { return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD : XE2_SAMPLER_MESSAGE_SAMPLE_MLOD; @@ -582,72 +583,71 @@ sampler_msg_type(const intel_device_info *devinfo, return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE : GFX5_SAMPLER_MESSAGE_SAMPLE; } - case FS_OPCODE_TXB: + case FS_OPCODE_TXB_LOGICAL: return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE : GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; - case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LOGICAL: assert(!has_min_lod); + if (lod_is_zero) { + return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ : + GFX9_SAMPLER_MESSAGE_SAMPLE_LZ; + } return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE : GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; - case SHADER_OPCODE_TXL_LZ: - assert(!has_min_lod); - return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ : - GFX9_SAMPLER_MESSAGE_SAMPLE_LZ; - case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TXS_LOGICAL: case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: assert(!has_min_lod); return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; - case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXD_LOGICAL: return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE : GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; - case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LOGICAL: assert(!has_min_lod); - return GFX5_SAMPLER_MESSAGE_SAMPLE_LD; - case SHADER_OPCODE_TXF_LZ: - assert(!has_min_lod); - return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; - case SHADER_OPCODE_TXF_CMS_W: + return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ : + GFX5_SAMPLER_MESSAGE_SAMPLE_LD; + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: assert(!has_min_lod); return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; - case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXF_MCS_LOGICAL: assert(!has_min_lod); return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; - case SHADER_OPCODE_LOD: + case SHADER_OPCODE_LOD_LOGICAL: assert(!has_min_lod); return GFX5_SAMPLER_MESSAGE_LOD; - case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_LOGICAL: assert(!has_min_lod); return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C : GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; break; - case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: assert(!has_min_lod); return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C : GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; - case SHADER_OPCODE_TG4_OFFSET_LOD: + case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: assert(!has_min_lod); assert(devinfo->ver >= 20); return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C: XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L; - case SHADER_OPCODE_TG4_OFFSET_BIAS: + case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: assert(!has_min_lod); assert(devinfo->ver >= 20); return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B; - case SHADER_OPCODE_TG4_BIAS: + case SHADER_OPCODE_TG4_BIAS_LOGICAL: assert(!has_min_lod); assert(devinfo->ver >= 20); return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B; - case SHADER_OPCODE_TG4_EXPLICIT_LOD: + case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: assert(!has_min_lod); assert(devinfo->ver >= 20); return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C : XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L; - case SHADER_OPCODE_TG4_IMPLICIT_LOD: + case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: assert(!has_min_lod); assert(devinfo->ver >= 20); return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C : XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I; - case SHADER_OPCODE_SAMPLEINFO: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: assert(!has_min_lod); return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; default: @@ -702,14 +702,14 @@ static bool shader_opcode_needs_header(opcode op) { switch (op) { - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_IMPLICIT_LOD: - case SHADER_OPCODE_SAMPLEINFO: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: + case SHADER_OPCODE_TG4_BIAS_LOGICAL: + case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: + case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: return true; default: break; @@ -719,7 +719,7 @@ shader_opcode_needs_header(opcode op) } static void -lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, +lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, const fs_reg &coordinate, const fs_reg &shadow_c, fs_reg lod, const fs_reg &lod2, @@ -746,6 +746,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D); unsigned reg_width = bld.dispatch_width() / 8; unsigned header_size = 0, length = 0; + opcode op = inst->opcode; fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE]; for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) sources[i] = bld.vgrf(payload_type); @@ -855,22 +856,14 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, } } - /* Change the opcode to account for LOD being zero before the - * switch-statement that emits sources based on the opcode. - */ - if (lod.is_zero()) { - if (op == SHADER_OPCODE_TXL) - op = SHADER_OPCODE_TXL_LZ; - else if (op == SHADER_OPCODE_TXF) - op = SHADER_OPCODE_TXF_LZ; - } + const bool lod_is_zero = lod.is_zero(); /* On Xe2 and newer platforms, min_lod is the first parameter specifically * so that a bunch of other, possibly unused, parameters don't need to also * be included. */ const unsigned msg_type = - sampler_msg_type(devinfo, op, inst->shadow_compare, + sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero, min_lod.file != BAD_FILE); const bool min_lod_is_first = devinfo->ver >= 20 && @@ -891,16 +884,19 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, /* Set up the LOD info */ switch (op) { - case FS_OPCODE_TXB: - case SHADER_OPCODE_TG4_BIAS: - case SHADER_OPCODE_TG4_EXPLICIT_LOD: - case SHADER_OPCODE_TG4_OFFSET_LOD: - case SHADER_OPCODE_TG4_OFFSET_BIAS: - case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LOGICAL: + if (lod_is_zero) + break; + FALLTHROUGH; + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TG4_BIAS_LOGICAL: + case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: bld.MOV(sources[length], lod); length++; break; - case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXD_LOGICAL: /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in * Xe2+). */ @@ -923,7 +919,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; - case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TXS_LOGICAL: bld.MOV(retype(sources[length], payload_unsigned_type), lod); length++; break; @@ -932,8 +928,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0)); length++; break; - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_LOGICAL: /* On Gfx9 the parameters are intermixed they are u, v, lod, r. */ bld.MOV(retype(sources[length++], payload_signed_type), coordinate); @@ -945,7 +940,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, } length++; - if (op != SHADER_OPCODE_TXF_LZ) { + if (!lod_is_zero) { bld.MOV(retype(sources[length], payload_signed_type), lod); length++; } @@ -957,50 +952,38 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - if (op == SHADER_OPCODE_TXF_CMS_W) { - bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index); - } + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index); /* Data from the multisample control surface. */ - if (op == SHADER_OPCODE_TXF_CMS_W) { - unsigned num_mcs_components = 1; - - /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - + for (unsigned i = 0; i < 2; ++i) { + /* Sampler always writes 4/8 register worth of data but for ld_mcs + * only valid data is in first two register. So with 16-bit + * payload, we need to split 2-32bit register into 4-16-bit + * payload. + * + * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - * Shared Functions - 3D Sampler - Messages - Message Format: * * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r */ - if (op == SHADER_OPCODE_TXF_CMS_W) - num_mcs_components = 2; - - for (unsigned i = 0; i < num_mcs_components; ++i) { - /* Sampler always writes 4/8 register worth of data but for ld_mcs - * only valid data is in first two register. So with 16-bit - * payload, we need to split 2-32bit register into 4-16-bit - * payload. - * - * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - - * Shared Functions - 3D Sampler - Messages - Message Format: - * - * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r - */ - if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W) { - fs_reg tmp = offset(mcs, bld, i); - bld.MOV(retype(sources[length++], payload_unsigned_type), - mcs.file == IMM ? mcs : - subscript(tmp, payload_unsigned_type, 0)); - bld.MOV(retype(sources[length++], payload_unsigned_type), - mcs.file == IMM ? mcs : - subscript(tmp, payload_unsigned_type, 1)); - } else { - bld.MOV(retype(sources[length++], payload_unsigned_type), - mcs.file == IMM ? mcs : offset(mcs, bld, i)); - } + if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { + fs_reg tmp = offset(mcs, bld, i); + bld.MOV(retype(sources[length++], payload_unsigned_type), + mcs.file == IMM ? mcs : + subscript(tmp, payload_unsigned_type, 0)); + bld.MOV(retype(sources[length++], payload_unsigned_type), + mcs.file == IMM ? mcs : + subscript(tmp, payload_unsigned_type, 1)); + } else { + bld.MOV(retype(sources[length++], payload_unsigned_type), + mcs.file == IMM ? mcs : offset(mcs, bld, i)); } } + FALLTHROUGH; + case SHADER_OPCODE_TXF_MCS_LOGICAL: /* There is no offsetting for this message; just copy in the integer * texture coordinates. */ @@ -1010,7 +993,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; - case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: /* More crazy intermixing */ for (unsigned i = 0; i < 2; i++) /* u, v */ bld.MOV(sources[length++], offset(coordinate, bld, i)); @@ -1037,7 +1020,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, if (min_lod.file != BAD_FILE && !min_lod_is_first) { /* Account for all of the missing coordinate sources */ - if (op == FS_OPCODE_TXB && devinfo->ver >= 20 && + if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20 && inst->has_packed_lod_ai_src) { /* Bspec 64985: * @@ -1052,7 +1035,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, * Param BIAS_AI U V R MLOD */ length += 3 - coord_components; - } else if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) { + } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) { /* On DG2 and newer platforms, sample_d can only be used with 1D and * 2D surfaces, so the maximum number of gradient components is 2. * In spite of this limitation, the Bspec lists a mysterious R @@ -1065,14 +1048,14 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, length += (2 - grad_components) * 2; } else { length += 4 - coord_components; - if (op == SHADER_OPCODE_TXD) + if (op == SHADER_OPCODE_TXD_LOGICAL) length += (3 - grad_components) * 2; } bld.MOV(sources[length++], min_lod); /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */ - if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB && + if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL && !inst->shadow_compare) bld.MOV(sources[length++], min_lod); } @@ -1113,10 +1096,6 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, inst->opcode = SHADER_OPCODE_SEND; inst->mlen = mlen; inst->header_size = header_size; - - assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare, - min_lod.file != BAD_FILE)); - inst->sfid = BRW_SFID_SAMPLER; if (surface.file == IMM && (sampler.file == IMM || sampler_handle.file != BAD_FILE)) { @@ -1203,7 +1182,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, static unsigned get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, - opcode op, const fs_inst *inst) + const fs_inst *inst) { assert(inst); const fs_reg *src = inst->src; @@ -1227,7 +1206,7 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, * which is already in 16-bits unlike the other parameters that need forced * conversion. */ - if (devinfo->verx10 < 125 || op != SHADER_OPCODE_TXF_CMS_W) { + if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { assert(src[i].file == BAD_FILE || brw_reg_type_to_size(src[i].type) == src_type_size); @@ -1246,10 +1225,9 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, * ld_mcs SIMD8H and SIMD16H Only * ld2dms REMOVEDBY(GEN:HAS:1406788836) */ - - if (op == SHADER_OPCODE_TXF_CMS_W || - op == SHADER_OPCODE_TXF_MCS || - (op == FS_OPCODE_TXB && !inst->has_packed_lod_ai_src && + if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL || + inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL || + (inst->opcode == FS_OPCODE_TXB_LOGICAL && !inst->has_packed_lod_ai_src && devinfo->ver >= 20)) src_type_size = 2; @@ -1257,7 +1235,7 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, } static void -lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) +lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; const fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE]; @@ -1280,12 +1258,12 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; const unsigned msg_payload_type_bit_size = - get_sampler_msg_payload_type_bit_size(devinfo, op, inst); + get_sampler_msg_payload_type_bit_size(devinfo, inst); /* 16-bit payloads are available only on gfx11+ */ assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11); - lower_sampler_logical_send(bld, inst, op, coordinate, + lower_sampler_logical_send(bld, inst, coordinate, shadow_c, lod, lod2, min_lod, sample_index, mcs, surface, sampler, @@ -2757,80 +2735,25 @@ brw_fs_lower_logical_sends(fs_visitor &s) break; case SHADER_OPCODE_TEX_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX); - break; - case SHADER_OPCODE_TXD_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD); - break; - case SHADER_OPCODE_TXF_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF); - break; - case SHADER_OPCODE_TXL_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL); - break; - case SHADER_OPCODE_TXS_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS); - break; - case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: - lower_sampler_logical_send(ibld, inst, - SHADER_OPCODE_IMAGE_SIZE_LOGICAL); - break; - case FS_OPCODE_TXB_LOGICAL: - lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB); - break; - case SHADER_OPCODE_TXF_CMS_W_LOGICAL: case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W); - break; - case SHADER_OPCODE_TXF_MCS_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS); - break; - case SHADER_OPCODE_LOD_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD); - break; - case SHADER_OPCODE_TG4_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4); - break; - case SHADER_OPCODE_TG4_BIAS_LOGICAL: - assert(devinfo->ver >= 20); - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_BIAS); - break; - case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: - assert(devinfo->ver >= 20); - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_EXPLICIT_LOD); - break; - case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: - assert(devinfo->ver >= 20); - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_IMPLICIT_LOD); - break; - case SHADER_OPCODE_TG4_OFFSET_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); - break; - case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_LOD); - break; - case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_BIAS); - break; - case SHADER_OPCODE_SAMPLEINFO_LOGICAL: - lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO); + lower_sampler_logical_send(ibld, inst); break; case SHADER_OPCODE_GET_BUFFER_SIZE: diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index f8d1c985d19..20638456677 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -231,90 +231,6 @@ schedule_node::set_latency(const struct brw_isa_info *isa) latency = 24; break; - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - /* 18 cycles: - * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; - * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; - * send(8) g4<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * - * 697 +/-49 cycles (min 610, n=26): - * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; - * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; - * send(8) g4<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; - * - * So the latency on our first texture load of the batchbuffer takes - * ~700 cycles, since the caches are cold at that point. - * - * 840 +/- 92 cycles (min 720, n=25): - * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; - * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; - * send(8) g4<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; - * send(8) g4<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; - * - * On the second load, it takes just an extra ~140 cycles, and after - * accounting for the 14 cycles of the MOV's latency, that makes ~130. - * - * 683 +/- 49 cycles (min = 602, n=47): - * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; - * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; - * send(8) g4<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * send(8) g50<1>UW g114<8,8,1>F - * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; - * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; - * - * The unit appears to be pipelined, since this matches up with the - * cache-cold case, despite there being two loads here. If you replace - * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39). - * - * So, take some number between the cache-hot 140 cycles and the - * cache-cold 700 cycles. No particular tuning was done on this. - * - * I haven't done significant testing of the non-TEX opcodes. TXL at - * least looked about the same as TEX. - */ - latency = 200; - break; - - case SHADER_OPCODE_TXS: - /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 - * cycles (n=15): - * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; - * send(8) g6<1>UW g114<8,8,1>F - * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; - * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; - * - * - * Two loads was 535 +/- 30 cycles (n=19): - * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; - * send(16) g6<1>UW g114<8,8,1>F - * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; - * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; - * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; - * send(16) g8<1>UW g114<8,8,1>F - * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; - * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; - * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; - * - * Since the only caches that should matter are just the - * instruction/state cache containing the surface state, assume that we - * always have hot caches. - */ - latency = 100; - break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: /* testing using varying-index pull constants: * @@ -352,12 +268,83 @@ schedule_node::set_latency(const struct brw_isa_info *isa) switch (msg_type) { case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO: case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO: - /* See also SHADER_OPCODE_TXS */ + /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 + * cycles (n=15): + * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; + * send(8) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; + * + * + * Two loads was 535 +/- 30 cycles (n=19): + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * send(16) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; + * send(16) g8<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; + * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; + * + * Since the only caches that should matter are just the + * instruction/state cache containing the surface state, + * assume that we always have hot caches. + */ latency = 100; break; default: - /* See also SHADER_OPCODE_TEX */ + /* 18 cycles: + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * + * 697 +/-49 cycles (min 610, n=26): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * So the latency on our first texture load of the batchbuffer + * takes ~700 cycles, since the caches are cold at that point. + * + * 840 +/- 92 cycles (min 720, n=25): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * On the second load, it takes just an extra ~140 cycles, and + * after accounting for the 14 cycles of the MOV's latency, that + * makes ~130. + * + * 683 +/- 49 cycles (min = 602, n=47): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * send(8) g50<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * The unit appears to be pipelined, since this matches up with + * the cache-cold case, despite there being two loads here. If + * you replace the g4 in the MOV to null with g50, it's still + * 693 +/- 52 (n=39). + * + * So, take some number between the cache-hot 140 cycles and the + * cache-cold 700 cycles. No particular tuning was done on this. + * + * I haven't done significant testing of the non-TEX opcodes. + * TXL at least looked about the same as TEX. + */ latency = 200; break; }