From feec9166cdb2d562e741a2775b3fa87fc0876707 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 20 Jul 2022 10:21:21 -0700 Subject: [PATCH] intel/compiler/xe2: Handle new URB write messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework: * idr v1: Fix compilation error. * idr v2: Add support for per-channel offsets. * idr v3: get_lowered_simd_width is 16 on Xe2+. * idr v4: Add disassembly support. Add validation support. * Squashed in changes from Marcin Ślusarz's patches: * "intel/compiler: skip adding 0 to payload address" * "intel/compiler/xe2: drop masking off top 8 bits of URB handle" Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_disasm.c | 120 +++++++++++---- src/intel/compiler/brw_eu_validate.c | 6 +- src/intel/compiler/brw_fs.cpp | 2 +- .../compiler/brw_lower_logical_sends.cpp | 142 +++++++++++++++++- 4 files changed, 234 insertions(+), 36 deletions(-) diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c index e92b60c327f..f7d1c6c6f87 100644 --- a/src/intel/compiler/brw_disasm.c +++ b/src/intel/compiler/brw_disasm.c @@ -2222,41 +2222,95 @@ brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa, } case BRW_SFID_URB: { - unsigned opcode = brw_inst_urb_opcode(devinfo, inst); - - format(file, " offset %"PRIu64, brw_inst_urb_global_offset(devinfo, inst)); - - space = 1; - - err |= control(file, "urb opcode", - devinfo->ver >= 7 ?
gfx7_urb_opcode - : gfx5_urb_opcode, - opcode, &space); - - if (devinfo->ver >= 7 && - brw_inst_urb_per_slot_offset(devinfo, inst)) { - string(file, " per-slot"); - } - - if (opcode == GFX8_URB_OPCODE_SIMD8_WRITE || - opcode == GFX8_URB_OPCODE_SIMD8_READ) { - if (brw_inst_urb_channel_mask_present(devinfo, inst)) - string(file, " masked"); - } else if (opcode != GFX125_URB_OPCODE_FENCE) { - err |= control(file, "urb swizzle", urb_swizzle, - brw_inst_urb_swizzle_control(devinfo, inst), + if (devinfo->ver >= 20) { + format(file, " ("); + const enum lsc_opcode op = lsc_msg_desc_opcode(devinfo, imm_desc); + err |= control(file, "operation", lsc_operation, + op, &space); + format(file, ","); + err |= control(file, "addr_size", lsc_addr_size, + lsc_msg_desc_addr_size(devinfo, imm_desc), &space); - } - if (devinfo->ver < 7) { - err |= control(file, "urb allocate", urb_allocate, - brw_inst_urb_allocate(devinfo, inst), &space); - err |= control(file, "urb used", urb_used, - brw_inst_urb_used(devinfo, inst), &space); - } - if (devinfo->ver < 8) { - err |= control(file, "urb complete", urb_complete, - brw_inst_urb_complete(devinfo, inst), &space); + format(file, ","); + err |= control(file, "data_size", lsc_data_size, + lsc_msg_desc_data_size(devinfo, imm_desc), + &space); + format(file, ","); + if (lsc_opcode_has_cmask(op)) { + err |= control(file, "component_mask", + lsc_cmask_str, + lsc_msg_desc_cmask(devinfo, imm_desc), + &space); + } else { + err |= control(file, "vector_size", + lsc_vect_size_str, + lsc_msg_desc_vect_size(devinfo, imm_desc), + &space); + if (lsc_msg_desc_transpose(devinfo, imm_desc)) + format(file, ", transpose"); + } + switch(op) { + case LSC_OP_LOAD_CMASK: + case LSC_OP_LOAD: + format(file, ","); + err |= control(file, "cache_load", + lsc_cache_load, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + default: + format(file, ","); + err |= control(file, "cache_store", + lsc_cache_store, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + 
&space); + break; + } + + format(file, " dst_len = %u,", lsc_msg_desc_dest_len(devinfo, imm_desc)); + format(file, " src0_len = %u,", lsc_msg_desc_src0_len(devinfo, imm_desc)); + format(file, " src1_len = %d", brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + err |= control(file, "address_type", lsc_addr_surface_type, + lsc_msg_desc_addr_type(devinfo, imm_desc), &space); + format(file, " )"); + } else { + unsigned urb_opcode = brw_inst_urb_opcode(devinfo, inst); + + format(file, " offset %"PRIu64, brw_inst_urb_global_offset(devinfo, inst)); + + space = 1; + + err |= control(file, "urb opcode", + devinfo->ver >= 7 ? gfx7_urb_opcode + : gfx5_urb_opcode, + urb_opcode, &space); + + if (devinfo->ver >= 7 && + brw_inst_urb_per_slot_offset(devinfo, inst)) { + string(file, " per-slot"); + } + + if (urb_opcode == GFX8_URB_OPCODE_SIMD8_WRITE || + urb_opcode == GFX8_URB_OPCODE_SIMD8_READ) { + if (brw_inst_urb_channel_mask_present(devinfo, inst)) + string(file, " masked"); + } else if (urb_opcode != GFX125_URB_OPCODE_FENCE) { + err |= control(file, "urb swizzle", urb_swizzle, + brw_inst_urb_swizzle_control(devinfo, inst), + &space); + } + + if (devinfo->ver < 7) { + err |= control(file, "urb allocate", urb_allocate, + brw_inst_urb_allocate(devinfo, inst), &space); + err |= control(file, "urb used", urb_used, + brw_inst_urb_used(devinfo, inst), &space); + } + if (devinfo->ver < 8) { + err |= control(file, "urb complete", urb_complete, + brw_inst_urb_complete(devinfo, inst), &space); + } } break; } diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index ec02cb2c703..f9383afd850 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -2485,6 +2485,10 @@ send_descriptor_restrictions(const struct brw_isa_info *isa, const uint32_t desc = brw_inst_send_desc(devinfo, inst); switch (brw_inst_sfid(devinfo, inst)) { + case BRW_SFID_URB: + if (devinfo->ver < 20) + break; + FALLTHROUGH; case GFX12_SFID_TGM: 
case GFX12_SFID_SLM: case GFX12_SFID_UGM: @@ -2500,7 +2504,7 @@ send_descriptor_restrictions(const struct brw_isa_info *isa, break; } - if (brw_inst_sfid(devinfo, inst) == BRW_SFID_URB) { + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_URB && devinfo->ver < 20) { /* Gfx4 doesn't have a "header present" bit in the SEND message. */ ERROR_IF(devinfo->ver > 4 && !brw_inst_header_present(devinfo, inst), "Header must be present for all URB messages."); diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 9716383ad94..d4545cabc7e 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5391,7 +5391,7 @@ get_lowered_simd_width(const struct brw_compiler *compiler, case SHADER_OPCODE_URB_READ_LOGICAL: case SHADER_OPCODE_URB_WRITE_LOGICAL: - return MIN2(8, inst->exec_size); + return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size); case SHADER_OPCODE_QUAD_SWIZZLE: { const unsigned swiz = inst->src[1].ud; diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 05e6b95a845..367a9abc672 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -266,6 +266,142 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst) inst->src[3] = brw_null_reg(); } +static void +lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst) +{ + /* Lowers to LSC URB stores. FINISHME: Channel masks not yet implemented. */ + assert(inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file == BAD_FILE); + + const intel_device_info *devinfo = bld.shader->devinfo; + assert(devinfo->has_lsc); + + /* Get the logical send arguments. */ + const fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE]; + const fs_reg src = inst->src[URB_LOGICAL_SRC_DATA]; + + /* Calculate the total number of components of the payload.
*/ + const unsigned src_comps = inst->components_read(URB_LOGICAL_SRC_DATA); + const unsigned src_sz = type_sz(src.type); + + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD); + + bld.MOV(payload, handle); + + /* The low 24 bits of the URB handle are a byte offset into the URB area. + * Add the offset of the write to this value. + * + * FINISHME: Confirm inst->offset is in vec4 units (it is scaled by 16 below). + */ + if (inst->offset) { + bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16)); + inst->offset = 0; + } + + if (inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) { + fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; + fs_reg payload1 = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg payload2 = bld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned i = 1; i < src_comps; i++) { + bld.MOV(payload2, offset(src, bld, i - 1)); + bld.ADD(payload1, payload, offset(offsets, bld, i - 1)); + + fs_inst *send = bld.emit(SHADER_OPCODE_SEND, + bld.null_reg_ud(), + brw_imm_ud(0) /* desc */, + brw_imm_ud(0) /* ex_desc */, + brw_vec8_grf(0, 0) /* payload */); + + const unsigned ex_mlen = (src_sz * inst->exec_size) / REG_SIZE; + + send->sfid = BRW_SFID_URB; + + send->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, send->exec_size, + LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, 1 /* num_channels */, + false /* transpose */, + LSC_CACHE_STORE_L1UC_L3UC, + false /* has_dest */); + + send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc); + send->ex_mlen = ex_mlen; + send->header_size = 0; + send->send_has_side_effects = true; + send->send_is_volatile = false; + + send->resize_sources(4); + + send->src[0] = brw_imm_ud(0); + send->src[1] = brw_imm_ud(0); + + send->src[2] = payload1; + send->src[3] = payload2; + } + + bld.MOV(payload2, offset(src, bld, src_comps - 1)); + bld.ADD(payload1, payload, offset(offsets, bld, src_comps - 1)); + const unsigned ex_mlen = (src_sz * inst->exec_size) / REG_SIZE; + + inst->sfid = BRW_SFID_URB; + + inst->desc =
lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size, + LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, 1 /* num_channels */, + false /* transpose */, + LSC_CACHE_STORE_L1UC_L3UC, + false /* has_dest */); + + + /* Reuse the original instruction for the store of the last component. */ + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); + inst->ex_mlen = ex_mlen; + inst->header_size = 0; + inst->send_has_side_effects = true; + inst->send_is_volatile = false; + + inst->resize_sources(4); + + inst->src[0] = brw_imm_ud(0); + inst->src[1] = brw_imm_ud(0); + + inst->src[2] = payload1; + inst->src[3] = payload2; + } else { + fs_reg payload2 = bld.move_to_vgrf(src, src_comps); + const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE; + + inst->sfid = BRW_SFID_URB; + + inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size, + LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, src_comps /* num_channels */, + false /* transpose */, + LSC_CACHE_STORE_L1UC_L3UC, + false /* has_dest */); + + + /* Convert the original instruction into the SEND in place. */ + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); + inst->ex_mlen = ex_mlen; + inst->header_size = 0; + inst->send_has_side_effects = true; + inst->send_is_volatile = false; + + inst->resize_sources(4); + + inst->src[0] = brw_imm_ud(0); + inst->src[1] = brw_imm_ud(0); + + inst->src[2] = payload; + inst->src[3] = payload2; + } +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -3201,7 +3337,11 @@ fs_visitor::lower_logical_sends() break; case SHADER_OPCODE_URB_WRITE_LOGICAL: - lower_urb_write_logical_send(ibld, inst); + if (devinfo->ver < 20) + lower_urb_write_logical_send(ibld, inst); + else + lower_urb_write_logical_send_xe2(ibld, inst); + break; default: