From 0a5bdf1199bea33d24d684f897724a696ae5f86f Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 13 Mar 2024 11:01:16 +0200 Subject: [PATCH] brw: add infra to make use of the address register in the IR This limits the address register to simple cases inside a block. Validation ensures that the address register is only written once and read once. Instruction scheduling makes sure that instructions using the address register in the generator are not scheduled while there is an usage of the register in the IR. Signed-off-by: Lionel Landwerlin Reviewed-by: Alyssa Rosenzweig Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_fs.cpp | 13 ++++++++ src/intel/compiler/brw_fs.h | 2 ++ src/intel/compiler/brw_fs_builder.h | 8 +++++ src/intel/compiler/brw_fs_validate.cpp | 45 +++++++++++++++++++++++++- src/intel/compiler/brw_fs_visitor.cpp | 2 ++ src/intel/compiler/brw_ir_fs.h | 6 ++++ src/intel/compiler/brw_print.cpp | 10 ++++-- src/intel/compiler/brw_reg.cpp | 13 ++++++++ src/intel/compiler/brw_reg.h | 3 ++ src/intel/compiler/brw_shader.cpp | 1 - 10 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index b7a1ffc233d..cb77c691cb9 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -799,6 +799,19 @@ fs_inst::is_raw_move() const brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type)); } +bool +fs_inst::uses_address_register_implicitly() const +{ + switch (opcode) { + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_SHUFFLE: + case SHADER_OPCODE_MOV_INDIRECT: + return true; + default: + return false; + } +} + /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. * This brings in those uniform definitions */ diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 739fc827f90..8d66055bc59 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -448,6 +448,8 @@ public: /* The API selected subgroup size */ unsigned api_subgroup_size; /**< 0, 8, 16, 32 */ + unsigned next_address_register_nr; + struct brw_shader_stats shader_stats; void debug_optimizer(const nir_shader *nir, diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index 638f1ac9145..c1820d8989a 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -212,6 +212,14 @@ namespace brw { return retype(null_reg_ud(), type); } + brw_reg + vaddr(enum brw_reg_type type, unsigned subnr) const + { + brw_reg addr = brw_address_reg(subnr); + addr.nr = shader->next_address_register_nr++; + return retype(addr, type); + } + /** * Create a null register of floating type. */ diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp index d80532fc4af..4796edfa449 100644 --- a/src/intel/compiler/brw_fs_validate.cpp +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -281,6 +281,16 @@ brw_fs_validate(const fs_visitor &s) s.cfg->validate(_mesa_shader_stage_to_abbrev(s.stage)); foreach_block(block, s.cfg) { + /* Track the last used address register. Usage of the address register + * in the IR should be limited to within a block, otherwise we would + * unable to schedule some instructions without spilling the address + * register to a VGRF. + * + * Another pattern we stick to when using the address register in the IR + * is that we write and read the register in pairs of instruction. + */ + uint32_t last_used_address_register[16] = {}; + foreach_inst_in_block (fs_inst, inst, block) { brw_validate_instruction_phase(s, inst); @@ -392,15 +402,24 @@ brw_fs_validate(const fs_visitor &s) if (inst->dst.file == VGRF) { fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst), s.alloc.sizes[inst->dst.nr]); - if (inst->exec_size > 1) fsv_assert_ne(inst->dst.stride, 0); + } else if (inst->dst.is_address()) { + fsv_assert(inst->dst.nr != 0); } + bool read_address_reg = false; for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(devinfo, inst, i), s.alloc.sizes[inst->src[i].nr]); + } else if (inst->src[i].is_address()) { + fsv_assert(inst->src[i].nr != 0); + for (unsigned hw = 0; hw < inst->size_read(devinfo, i); hw += 2) { + fsv_assert_eq(inst->src[i].nr, + last_used_address_register[inst->src[i].address_slot(hw)]); + } + read_address_reg = true; } } @@ -516,6 +535,30 @@ brw_fs_validate(const fs_visitor &s) inst->src[i].type != BRW_TYPE_HF); } } + + /* Update the last used address register. */ + if (read_address_reg) { + /* When an instruction only reads the address register, we assume + * the read parts are never going to be used again. + */ + for (unsigned i = 0; i < inst->sources; i++) { + if (!inst->src[i].is_address()) + continue; + for (unsigned hw = 0; hw < inst->size_read(devinfo, i); hw += 2) + last_used_address_register[inst->src[i].address_slot(hw)] = 0; + } + } + if (inst->dst.is_address()) { + /* For the written part of the address register */ + for (unsigned hw = 0; hw < inst->size_written; hw += 2) + last_used_address_register[inst->dst.address_slot(hw)] = inst->dst.nr; + } else if (inst->uses_address_register_implicitly()) { + /* If the instruction is making use of the address register, + * discard the entire thing. + */ + memset(last_used_address_register, 0, + sizeof(last_used_address_register)); + } } } } diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 907203555e3..b055c9fc031 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -471,6 +471,8 @@ fs_visitor::init() this->spilled_any_registers = false; this->phase = BRW_SHADER_PHASE_INITIAL; + + this->next_address_register_nr = 1; } fs_visitor::~fs_visitor() diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 27dbecf92dd..f9564886625 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -122,6 +122,12 @@ public: */ bool has_sampler_residency() const; + /** + * Return true if this instruction is using the address register + * implicitly. + */ + bool uses_address_register_implicitly() const; + uint8_t sources; /**< Number of brw_reg sources. */ /** diff --git a/src/intel/compiler/brw_print.cpp b/src/intel/compiler/brw_print.cpp index c008a4b94f1..dea47d75628 100644 --- a/src/intel/compiler/brw_print.cpp +++ b/src/intel/compiler/brw_print.cpp @@ -434,7 +434,10 @@ brw_print_instruction(const fs_visitor &s, const fs_inst *inst, FILE *file, cons fprintf(file, "***attr%d***", inst->dst.nr); break; case ADDRESS: - fprintf(file, "a0.%d", inst->dst.subnr); + if (inst->dst.nr == 0) + fprintf(file, "a0.%d", inst->dst.subnr); + else + fprintf(file, "va%u.%d", inst->dst.nr, inst->dst.subnr); break; case ARF: switch (inst->dst.nr & 0xF0) { @@ -498,7 +501,10 @@ brw_print_instruction(const fs_visitor &s, const fs_inst *inst, FILE *file, cons fprintf(file, "g%d", inst->src[i].nr); break; case ADDRESS: - fprintf(file, "a0.%d", inst->src[i].subnr); + if (inst->src[i].nr == 0) + fprintf(file, "a0.%d", inst->src[i].subnr); + else + fprintf(file, "va%u.%d", inst->src[i].nr, inst->src[i].subnr); break; case ATTR: fprintf(file, "attr%d", inst->src[i].nr); diff --git a/src/intel/compiler/brw_reg.cpp b/src/intel/compiler/brw_reg.cpp index ab119dbcfaa..aab993ba296 100644 --- a/src/intel/compiler/brw_reg.cpp +++ b/src/intel/compiler/brw_reg.cpp @@ -263,6 +263,19 @@ brw_reg::is_accumulator() const return file == ARF && (nr & 0xF0) == BRW_ARF_ACCUMULATOR; } +bool +brw_reg::is_address() const +{ + return file == ADDRESS; +} + +unsigned +brw_reg::address_slot(unsigned byte_offset) const +{ + assert(is_address()); + return (reg_offset(*this) + byte_offset) / 2; +} + bool brw_reg::equals(const brw_reg &r) const { diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h index 144f2554195..2a5de328419 100644 --- a/src/intel/compiler/brw_reg.h +++ b/src/intel/compiler/brw_reg.h @@ -226,6 +226,9 @@ typedef struct brw_reg { bool is_negative_one() const; bool is_null() const; bool is_accumulator() const; + bool is_address() const; + + unsigned address_slot(unsigned byte_offset) const; /** * Return the size in bytes of a single logical component of the diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d8449e09906..64d2e25f633 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -323,4 +323,3 @@ fs_inst::remove(bblock_t *block, bool defer_later_block_ip_updates) exec_node::remove(); } -