intel/fs: Emit HALT for discard on Gen4-5

Using HALT to immediately jump to the end of the shader is required to
implement GL_EXT_gpu_shader4 and OpenGL 3.0.  However, vanilla OpenGL
1.2 doesn't forbid it and it likely makes something somewhere faster.
We should be consistent and implement the same discard behavior on all
hardware if we can.

The rules for HALT on Gen4-5 are a bit different from Gen6+.  On the
older hardware, there is no stack for HALT; instead it's up to software
to save and restore mask registers.  However, there's no real saving
needed since we only use HALT to jump to the end of the program where
we're about about to do our FB writes.  All we need to do is reset AMask
to DMask, the value it was initialized to at the start of the thread.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5244>
This commit is contained in:
Jason Ekstrand
2020-04-25 14:59:30 -05:00
committed by Marge Bot
parent 94aa7997e4
commit c48f42e178
6 changed files with 107 additions and 30 deletions

View File

@@ -708,6 +708,9 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
format(file, "mask%d", _reg_nr & 0x0f); format(file, "mask%d", _reg_nr & 0x0f);
break; break;
case BRW_ARF_MASK_STACK: case BRW_ARF_MASK_STACK:
format(file, "ms%d", _reg_nr & 0x0f);
break;
case BRW_ARF_MASK_STACK_DEPTH:
format(file, "msd%d", _reg_nr & 0x0f); format(file, "msd%d", _reg_nr & 0x0f);
break; break;
case BRW_ARF_STATE: case BRW_ARF_STATE:

View File

@@ -1102,7 +1102,7 @@ brw_inst *brw_WHILE(struct brw_codegen *p);
brw_inst *brw_BREAK(struct brw_codegen *p); brw_inst *brw_BREAK(struct brw_codegen *p);
brw_inst *brw_CONT(struct brw_codegen *p); brw_inst *brw_CONT(struct brw_codegen *p);
brw_inst *gen6_HALT(struct brw_codegen *p); brw_inst *brw_HALT(struct brw_codegen *p);
/* Forward jumps: /* Forward jumps:
*/ */

View File

@@ -1725,14 +1725,23 @@ brw_CONT(struct brw_codegen *p)
} }
brw_inst * brw_inst *
gen6_HALT(struct brw_codegen *p) brw_HALT(struct brw_codegen *p)
{ {
const struct gen_device_info *devinfo = p->devinfo; const struct gen_device_info *devinfo = p->devinfo;
brw_inst *insn; brw_inst *insn;
insn = next_insn(p, BRW_OPCODE_HALT); insn = next_insn(p, BRW_OPCODE_HALT);
brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
if (devinfo->gen < 8) { if (devinfo->gen < 6) {
/* From the Gen4 PRM:
*
* "IP register must be put (for example, by the assembler) at <dst>
* and <src0> locations.
*/
brw_set_dest(p, insn, brw_ip_reg());
brw_set_src0(p, insn, brw_ip_reg());
brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
} else if (devinfo->gen < 8) {
brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
} else if (devinfo->gen < 12) { } else if (devinfo->gen < 12) {

View File

@@ -224,25 +224,27 @@ public:
bool bool
fs_generator::patch_discard_jumps_to_fb_writes() fs_generator::patch_discard_jumps_to_fb_writes()
{ {
if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) if (this->discard_halt_patches.is_empty())
return false; return false;
int scale = brw_jump_scale(p->devinfo); int scale = brw_jump_scale(p->devinfo);
/* There is a somewhat strange undocumented requirement of using if (devinfo->gen >= 6) {
* HALT, according to the simulator. If some channel has HALTed to /* There is a somewhat strange undocumented requirement of using
* a particular UIP, then by the end of the program, every channel * HALT, according to the simulator. If some channel has HALTed to
* must have HALTed to that UIP. Furthermore, the tracking is a * a particular UIP, then by the end of the program, every channel
* stack, so you can't do the final halt of a UIP after starting * must have HALTed to that UIP. Furthermore, the tracking is a
* halting to a new UIP. * stack, so you can't do the final halt of a UIP after starting
* * halting to a new UIP.
* Symptoms of not emitting this instruction on actual hardware *
* included GPU hangs and sparkly rendering on the piglit discard * Symptoms of not emitting this instruction on actual hardware
* tests. * included GPU hangs and sparkly rendering on the piglit discard
*/ * tests.
brw_inst *last_halt = gen6_HALT(p); */
brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); brw_inst *last_halt = brw_HALT(p);
brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
}
int ip = p->nr_insn; int ip = p->nr_insn;
@@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes()
brw_inst *patch = &p->store[patch_ip->ip]; brw_inst *patch = &p->store[patch_ip->ip];
assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
/* HALT takes a half-instruction distance from the pre-incremented IP. */ if (devinfo->gen >= 6) {
brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); /* HALT takes a half-instruction distance from the pre-incremented IP. */
brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
} else {
brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
}
} }
this->discard_halt_patches.make_empty(); this->discard_halt_patches.make_empty();
if (devinfo->gen < 6) {
/* From the g965 PRM:
*
* "As DMask is not automatically reloaded into AMask upon completion
* of this instruction, software has to manually restore AMask upon
* completion."
*
* DMask lives in the bottom 16 bits of sr0.1.
*/
brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
}
if (devinfo->gen == 4 && !devinfo->is_g4x) {
/* From the g965 PRM:
*
* "[DevBW, DevCL] Erratum: The subfields in mask stack register are
* reset to zero during graphics reset, however, they are not
* initialized at thread dispatch. These subfields will retain the
* values from the previous thread. Software should make sure the
* mask stack is empty (reset to zero) before terminating the thread.
* In case that this is not practical, software may have to reset the
* mask stack at the beginning of each kernel, which will impact the
* performance."
*
* Luckily we can rely on:
*
* "[DevBW, DevCL] This register access restriction is not
* applicable, hardware does ensure execution pipeline coherency,
* when a mask stack register is used as an explicit source and/or
* destination."
*/
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
brw_set_default_exec_size(p, BRW_EXECUTE_2);
brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
brw_set_default_exec_size(p, BRW_EXECUTE_16);
/* Reset the if stack. */
brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
brw_imm_uw(0));
brw_pop_insn_state(p);
}
return true; return true;
} }
@@ -1362,14 +1420,12 @@ fs_generator::generate_ddy(const fs_inst *inst,
void void
fs_generator::generate_discard_jump(fs_inst *) fs_generator::generate_discard_jump(fs_inst *)
{ {
assert(devinfo->gen >= 6);
/* This HALT will be patched up at FB write time to point UIP at the end of /* This HALT will be patched up at FB write time to point UIP at the end of
* the program, and at brw_uip_jip() JIP will be set to the end of the * the program, and at brw_uip_jip() JIP will be set to the end of the
* current block (or the program). * current block (or the program).
*/ */
this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
gen6_HALT(p); brw_HALT(p);
} }
void void

View File

@@ -3482,13 +3482,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
cmp->predicate = BRW_PREDICATE_NORMAL; cmp->predicate = BRW_PREDICATE_NORMAL;
cmp->flag_subreg = sample_mask_flag_subreg(this); cmp->flag_subreg = sample_mask_flag_subreg(this);
if (devinfo->gen >= 6) { emit_discard_jump();
/* Due to the way we implement discard, the jump will only happen
* when the whole quad is discarded. So we can do this even for
* demote as it won't break its uniformity promises.
*/
emit_discard_jump();
}
if (devinfo->gen < 7) if (devinfo->gen < 7)
limit_dispatch_width( limit_dispatch_width(

View File

@@ -939,6 +939,21 @@ brw_dmask_reg()
return brw_sr0_reg(2); return brw_sr0_reg(2);
} }
static inline struct brw_reg
brw_mask_stack_reg(unsigned subnr)
{
return suboffset(retype(brw_vec16_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_MASK_STACK, 0),
BRW_REGISTER_TYPE_UB), subnr);
}
static inline struct brw_reg
brw_mask_stack_depth_reg(unsigned subnr)
{
return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_MASK_STACK_DEPTH, subnr);
}
static inline struct brw_reg static inline struct brw_reg
brw_message_reg(unsigned nr) brw_message_reg(unsigned nr)
{ {