From c0c243f1cb4e9ad7b7e1e3ab1d763494d8936c09 Mon Sep 17 00:00:00 2001 From: Rohan Garg Date: Tue, 6 Sep 2022 17:31:51 +0200 Subject: [PATCH] anv, iris: Disable prefetching the binding table entries on DG2 On DG2 the HW will fetch the binding table entries into the cache for every single thread when a compute walker is dispatched, wiping out the advantages of the cache prefetch. The spec also advises against doing a cache prefetch when we have more than 31 binding table entries, but most real-world applications will never hit that limit. Signed-off-by: Rohan Garg Reviewed-by: Kenneth Graunke Part-of: --- src/gallium/drivers/iris/iris_state.c | 8 ++++++-- src/intel/vulkan/genX_cmd_buffer.c | 5 +++-- src/intel/vulkan/genX_pipeline.c | 5 ++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 12f13ccd972..3afb152a5c1 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -4741,7 +4741,9 @@ iris_store_cs_state(const struct intel_device_info *devinfo, assert(cs_prog_data->push.cross_thread.regs == 0); #endif desc.BarrierEnable = cs_prog_data->uses_barrier; - desc.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31); + /* Typically set to 0 to avoid prefetching on every thread dispatch. */ + desc.BindingTableEntryCount = devinfo->verx10 == 125 ? + 0 : MIN2(shader->bt.size_bytes / 4, 31); desc.SamplerCount = encode_sampler_count(shader); #if GFX_VER >= 12 /* TODO: Check if we are missing workarounds and enable mid-thread @@ -7212,7 +7214,9 @@ iris_upload_compute_walker(struct iris_context *ice, .SamplerStatePointer = shs->sampler_table.offset, .SamplerCount = encode_sampler_count(shader), .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE], - .BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31), + /* Typically set to 0 to avoid prefetching on every thread dispatch. 
*/ + .BindingTableEntryCount = devinfo->verx10 == 125 ? + 0 : MIN2(shader->bt.size_bytes / 4, 31), }; assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0); diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 70cbceb942b..90dd68803a1 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5096,8 +5096,9 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, - .BindingTableEntryCount = - 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), + /* Typically set to 0 to avoid prefetching on every thread dispatch. */ + .BindingTableEntryCount = devinfo->verx10 == 125 ? + 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared), diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 4a04be33bd8..44a9855e7c8 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -2169,8 +2169,11 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), /* We add 1 because the CS indirect parameters buffer isn't accounted * for in bind_map.surface_count. + * + * Typically set to 0 to avoid prefetching on every thread dispatch. */ - .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), + .BindingTableEntryCount = devinfo->verx10 == 125 ? + 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),