diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
index ff46cb9549a..486f55d7bec 100644
--- a/src/intel/compiler/brw_disasm.c
+++ b/src/intel/compiler/brw_disasm.c
@@ -708,6 +708,9 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
       format(file, "mask%d", _reg_nr & 0x0f);
       break;
    case BRW_ARF_MASK_STACK:
+      format(file, "ms%d", _reg_nr & 0x0f);
+      break;
+   case BRW_ARF_MASK_STACK_DEPTH:
       format(file, "msd%d", _reg_nr & 0x0f);
       break;
    case BRW_ARF_STATE:
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 262c527b2e9..9ed481e10e6 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1102,7 +1102,7 @@ brw_inst *brw_WHILE(struct brw_codegen *p);
 
 brw_inst *brw_BREAK(struct brw_codegen *p);
 brw_inst *brw_CONT(struct brw_codegen *p);
-brw_inst *gen6_HALT(struct brw_codegen *p);
+brw_inst *brw_HALT(struct brw_codegen *p);
 
 /* Forward jumps:
  */
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 12042131999..cc1bc8cc13f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1725,14 +1725,23 @@ brw_CONT(struct brw_codegen *p)
 }
 
 brw_inst *
-gen6_HALT(struct brw_codegen *p)
+brw_HALT(struct brw_codegen *p)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    brw_inst *insn;
 
    insn = next_insn(p, BRW_OPCODE_HALT);
    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
-   if (devinfo->gen < 8) {
+   if (devinfo->gen < 6) {
+      /* From the Gen4 PRM:
+       *
+       *    "IP register must be put (for example, by the assembler) at <dst>
+       *    and <src0> locations.
+       */
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
+   } else if (devinfo->gen < 8) {
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
    } else if (devinfo->gen < 12) {
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 5825e0770d4..bee9816d815 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -224,25 +224,27 @@ public:
 bool
 fs_generator::patch_discard_jumps_to_fb_writes()
 {
-   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+   if (this->discard_halt_patches.is_empty())
       return false;
 
    int scale = brw_jump_scale(p->devinfo);
 
-   /* There is a somewhat strange undocumented requirement of using
-    * HALT, according to the simulator. If some channel has HALTed to
-    * a particular UIP, then by the end of the program, every channel
-    * must have HALTed to that UIP. Furthermore, the tracking is a
-    * stack, so you can't do the final halt of a UIP after starting
-    * halting to a new UIP.
-    *
-    * Symptoms of not emitting this instruction on actual hardware
-    * included GPU hangs and sparkly rendering on the piglit discard
-    * tests.
-    */
-   brw_inst *last_halt = gen6_HALT(p);
-   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
-   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   if (devinfo->gen >= 6) {
+      /* There is a somewhat strange undocumented requirement of using
+       * HALT, according to the simulator. If some channel has HALTed to
+       * a particular UIP, then by the end of the program, every channel
+       * must have HALTed to that UIP. Furthermore, the tracking is a
+       * stack, so you can't do the final halt of a UIP after starting
+       * halting to a new UIP.
+       *
+       * Symptoms of not emitting this instruction on actual hardware
+       * included GPU hangs and sparkly rendering on the piglit discard
+       * tests.
+       */
+      brw_inst *last_halt = brw_HALT(p);
+      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   }
 
    int ip = p->nr_insn;
 
@@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes()
       brw_inst *patch = &p->store[patch_ip->ip];
 
       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
-      /* HALT takes a half-instruction distance from the pre-incremented IP. */
-      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      if (devinfo->gen >= 6) {
+         /* HALT takes a half-instruction distance from the pre-incremented IP. */
+         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      } else {
+         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+      }
    }
 
    this->discard_halt_patches.make_empty();
+
+   if (devinfo->gen < 6) {
+      /* From the g965 PRM:
+       *
+       *    "As DMask is not automatically reloaded into AMask upon completion
+       *     of this instruction, software has to manually restore AMask upon
+       *     completion."
+       *
+       * DMask lives in the bottom 16 bits of sr0.1.
+       */
+      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+                                retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+   }
+
+   if (devinfo->gen == 4 && !devinfo->is_g4x) {
+      /* From the g965 PRM:
+       *
+       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+       *     reset to zero during graphics reset, however, they are not
+       *     initialized at thread dispatch. These subfields will retain the
+       *     values from the previous thread. Software should make sure the
+       *     mask stack is empty (reset to zero) before terminating the thread.
+       *     In case that this is not practical, software may have to reset the
+       *     mask stack at the beginning of each kernel, which will impact the
+       *     performance."
+       *
+       * Luckily we can rely on:
+       *
+       *    "[DevBW, DevCL] This register access restriction is not
+       *     applicable, hardware does ensure execution pipeline coherency,
+       *     when a mask stack register is used as an explicit source and/or
+       *     destination."
+       */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_2);
+      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_16);
+      /* Reset the if stack. */
+      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0));
+
+      brw_pop_insn_state(p);
+   }
+
    return true;
 }
 
@@ -1362,14 +1420,12 @@ fs_generator::generate_ddy(const fs_inst *inst,
 void
 fs_generator::generate_discard_jump(fs_inst *)
 {
-   assert(devinfo->gen >= 6);
-
    /* This HALT will be patched up at FB write time to point UIP at the end of
     * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-   gen6_HALT(p);
+   brw_HALT(p);
 }
 
 void
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8916f9b876e..37c1d2f4cc9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3482,13 +3482,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = sample_mask_flag_subreg(this);
 
-      if (devinfo->gen >= 6) {
-         /* Due to the way we implement discard, the jump will only happen
-          * when the whole quad is discarded. So we can do this even for
-          * demote as it won't break its uniformity promises.
-          */
-         emit_discard_jump();
-      }
+      emit_discard_jump();
 
       if (devinfo->gen < 7)
          limit_dispatch_width(
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index 865cd9e0f36..620b8722d92 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -939,6 +939,21 @@ brw_dmask_reg()
    return brw_sr0_reg(2);
 }
 
+static inline struct brw_reg
+brw_mask_stack_reg(unsigned subnr)
+{
+   return suboffset(retype(brw_vec16_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                         BRW_ARF_MASK_STACK, 0),
+                           BRW_REGISTER_TYPE_UB), subnr);
+}
+
+static inline struct brw_reg
+brw_mask_stack_depth_reg(unsigned subnr)
+{
+   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                      BRW_ARF_MASK_STACK_DEPTH, subnr);
+}
+
 static inline struct brw_reg
 brw_message_reg(unsigned nr)
 {