anv, iris: Disable prefetching of binding table entries on DG2

On DG2 the HW fetches the binding table entries into the cache
for every single thread when a compute walker is dispatched,
wiping out the advantage of prefetching them.

The spec also advises against a cache prefetch when there are more than
31 binding table entries, but most real-world applications will never
hit that limit.

Signed-off-by: Rohan Garg <rohan.garg@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18498>
This commit is contained in:
Rohan Garg
2022-09-06 17:31:51 +02:00
parent d91c3bde8c
commit c0c243f1cb
3 changed files with 13 additions and 5 deletions

View File

@@ -4741,7 +4741,9 @@ iris_store_cs_state(const struct intel_device_info *devinfo,
assert(cs_prog_data->push.cross_thread.regs == 0); assert(cs_prog_data->push.cross_thread.regs == 0);
#endif #endif
desc.BarrierEnable = cs_prog_data->uses_barrier; desc.BarrierEnable = cs_prog_data->uses_barrier;
desc.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31); /* Typically set to 0 to avoid prefetching on every thread dispatch. */
desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bt.size_bytes / 4, 31);
desc.SamplerCount = encode_sampler_count(shader); desc.SamplerCount = encode_sampler_count(shader);
#if GFX_VER >= 12 #if GFX_VER >= 12
/* TODO: Check if we are missing workarounds and enable mid-thread /* TODO: Check if we are missing workarounds and enable mid-thread
@@ -7212,7 +7214,9 @@ iris_upload_compute_walker(struct iris_context *ice,
.SamplerStatePointer = shs->sampler_table.offset, .SamplerStatePointer = shs->sampler_table.offset,
.SamplerCount = encode_sampler_count(shader), .SamplerCount = encode_sampler_count(shader),
.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE], .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE],
.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31), /* Typically set to 0 to avoid prefetching on every thread dispatch. */
.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : MIN2(shader->bt.size_bytes / 4, 31),
}; };
assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0); assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0);

View File

@@ -5096,8 +5096,9 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
.BindingTablePointer = .BindingTablePointer =
cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
.BindingTableEntryCount = /* Typically set to 0 to avoid prefetching on every thread dispatch. */
1 + MIN2(pipeline->cs->bind_map.surface_count, 30), .BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = encode_slm_size(GFX_VER, .SharedLocalMemorySize = encode_slm_size(GFX_VER,
prog_data->base.total_shared), prog_data->base.total_shared),

View File

@@ -2169,8 +2169,11 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
/* We add 1 because the CS indirect parameters buffer isn't accounted /* We add 1 because the CS indirect parameters buffer isn't accounted
* for in bind_map.surface_count. * for in bind_map.surface_count.
*
* Typically set to 0 to avoid prefetching on every thread dispatch.
*/ */
.BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), .BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.BarrierEnable = cs_prog_data->uses_barrier, .BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize = .SharedLocalMemorySize =
encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),