intel/compiler: Use an existing URB write to end TCS threads when viable

VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set.  For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.

In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders.  We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow.  So we can't guarantee that there's
a URB write at the end of the shader.

Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to a single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so.  But on other platforms, there's no such
field, and this write is purely wasteful.

Instead of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.

No changes in shader-db.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
This commit is contained in:
Kenneth Graunke
2022-08-03 20:54:52 -07:00
parent 19c40e0730
commit be21d54aca
2 changed files with 61 additions and 23 deletions

View File

@@ -1511,6 +1511,34 @@ fs_visitor::resolve_source_modifiers(const fs_reg &src)
return temp;
}
/**
* Walk backwards from the end of the program looking for a URB write that
* isn't in control flow, and mark it with EOT.
*
* Return true if successful or false if a separate EOT write is needed.
*/
bool
fs_visitor::mark_last_urb_write_with_eot()
{
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
prev->eot = true;
/* Delete now dead instructions. */
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
if (dead == prev)
break;
dead->remove();
}
return true;
} else if (prev->is_control_flow() || prev->has_side_effects()) {
break;
}
}
return false;
}
void
fs_visitor::emit_gs_thread_end()
{
@@ -1526,21 +1554,12 @@ fs_visitor::emit_gs_thread_end()
fs_inst *inst;
if (gs_prog_data->static_vertex_count != -1) {
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
prev->eot = true;
/* Delete now dead instructions. */
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
if (dead == prev)
break;
dead->remove();
}
/* Try and tag the last URB write with EOT instead of emitting a whole
* separate write just to finish the thread.
*/
if (mark_last_urb_write_with_eot())
return;
} else if (prev->is_control_flow() || prev->has_side_effects()) {
break;
}
}
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
@@ -6555,6 +6574,31 @@ fs_visitor::set_tcs_invocation_id()
}
}
void
fs_visitor::emit_tcs_thread_end()
{
/* Try and tag the last URB write with EOT instead of emitting a whole
* separate write just to finish the thread. There isn't guaranteed to
* be one, so this may not succeed.
*/
if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
return;
/* Emit a URB write to end the thread. On Broadwell, we use this to write
* zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
* algorithm to set it optimally). On other platforms, we simply write
* zero to a reserved/MBZ patch header DWord which has no consequence.
*/
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 3;
inst->eot = true;
}
bool
fs_visitor::run_tcs()
{
@@ -6587,15 +6631,7 @@ fs_visitor::run_tcs()
bld.emit(BRW_OPCODE_ENDIF);
}
/* Emit EOT write; set TR DS Cache bit */
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 3;
inst->eot = true;
emit_tcs_thread_end();
if (failed)
return false;

View File

@@ -412,6 +412,8 @@ public:
void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
unsigned base_offset, const nir_src &offset_src,
unsigned num_components, unsigned first_component);
bool mark_last_urb_write_with_eot();
void emit_tcs_thread_end();
void emit_urb_fence();
void emit_cs_terminate();
fs_reg emit_work_group_id_setup();