anv: device: calculate compute thread counts using subslice counts

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Lionel Landwerlin
2016-09-07 17:19:35 +01:00
parent 1f291369e4
commit 09394ee6cf
6 changed files with 74 additions and 18 deletions

View File

@@ -924,14 +924,15 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
if (size == 0) { if (size == 0) {
/* We own the lock. Allocate a buffer */ /* We own the lock. Allocate a buffer */
struct gen_device_info *devinfo = &device->info; struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
uint32_t max_threads[] = { uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads, [MESA_SHADER_VERTEX] = physical_device->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_hs_threads, [MESA_SHADER_TESS_CTRL] = physical_device->max_hs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_ds_threads, [MESA_SHADER_TESS_EVAL] = physical_device->max_ds_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, [MESA_SHADER_GEOMETRY] = physical_device->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, [MESA_SHADER_FRAGMENT] = physical_device->max_wm_threads,
[MESA_SHADER_COMPUTE] = devinfo->max_cs_threads, [MESA_SHADER_COMPUTE] = physical_device->max_cs_threads,
}; };
size = per_thread_scratch * max_threads[stage]; size = per_thread_scratch * max_threads[stage];

View File

@@ -136,6 +136,41 @@ anv_physical_device_init(struct anv_physical_device *device,
bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X); bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
device->max_vs_threads = device->info->max_vs_threads;
device->max_hs_threads = device->info->max_hs_threads;
device->max_ds_threads = device->info->max_ds_threads;
device->max_gs_threads = device->info->max_gs_threads;
device->max_wm_threads = device->info->max_wm_threads;
/* GENs prior to 8 do not support EU/Subslice info */
if (device->info->gen >= 8) {
device->subslice_total = anv_gem_get_param(fd, I915_PARAM_SUBSLICE_TOTAL);
device->eu_total = anv_gem_get_param(fd, I915_PARAM_EU_TOTAL);
/* Without this information, we cannot get the right Braswell
* brandstrings, and we have to use conservative numbers for GPGPU on
* many platforms, but otherwise, things will just work.
*/
if (device->subslice_total < 1 || device->eu_total < 1) {
fprintf(stderr, "WARNING: Kernel 4.1 required to properly"
" query GPU properties.\n");
}
} else if (device->info->gen == 7) {
device->subslice_total = 1 << (device->info->gt - 1);
}
if (device->info->is_cherryview &&
device->subslice_total > 0 && device->eu_total > 0) {
/* Logical CS threads = EUs per subslice * 7 threads per EU */
device->max_cs_threads = device->eu_total / device->subslice_total * 7;
/* Fuse configurations may give more threads than expected, never less. */
if (device->max_cs_threads < device->info->max_cs_threads)
device->max_cs_threads = device->info->max_cs_threads;
} else {
device->max_cs_threads = device->info->max_cs_threads;
}
close(fd); close(fd);
brw_process_intel_debug_variable(); brw_process_intel_debug_variable();
@@ -503,11 +538,11 @@ void anv_GetPhysicalDeviceProperties(
.maxFragmentCombinedOutputResources = 8, .maxFragmentCombinedOutputResources = 8,
.maxComputeSharedMemorySize = 32768, .maxComputeSharedMemorySize = 32768,
.maxComputeWorkGroupCount = { 65535, 65535, 65535 }, .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
.maxComputeWorkGroupInvocations = 16 * devinfo->max_cs_threads, .maxComputeWorkGroupInvocations = 16 * pdevice->max_cs_threads,
.maxComputeWorkGroupSize = { .maxComputeWorkGroupSize = {
16 * devinfo->max_cs_threads, 16 * pdevice->max_cs_threads,
16 * devinfo->max_cs_threads, 16 * pdevice->max_cs_threads,
16 * devinfo->max_cs_threads, 16 * pdevice->max_cs_threads,
}, },
.subPixelPrecisionBits = 4 /* FIXME */, .subPixelPrecisionBits = 4 /* FIXME */,
.subTexelPrecisionBits = 4 /* FIXME */, .subTexelPrecisionBits = 4 /* FIXME */,

View File

@@ -570,6 +570,20 @@ struct anv_physical_device {
struct isl_device isl_dev; struct isl_device isl_dev;
int cmd_parser_version; int cmd_parser_version;
uint32_t eu_total;
uint32_t subslice_total;
/**
* Platform specific constants containing the maximum number of threads
* for each pipeline stage.
*/
uint32_t max_vs_threads;
uint32_t max_hs_threads;
uint32_t max_ds_threads;
uint32_t max_gs_threads;
uint32_t max_wm_threads;
uint32_t max_cs_threads;
struct anv_wsi_interface * wsi[VK_ICD_WSI_PLATFORM_MAX]; struct anv_wsi_interface * wsi[VK_ICD_WSI_PLATFORM_MAX];
}; };

View File

@@ -45,6 +45,8 @@ genX(graphics_pipeline_create)(
{ {
ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass); ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass]; struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
struct anv_pipeline *pipeline; struct anv_pipeline *pipeline;
VkResult result; VkResult result;
@@ -123,7 +125,7 @@ genX(graphics_pipeline_create)(
vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length; vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
vs.VertexURBEntryReadOffset = 0; vs.VertexURBEntryReadOffset = 0;
vs.MaximumNumberofThreads = device->info.max_vs_threads - 1; vs.MaximumNumberofThreads = physical_device->max_vs_threads - 1;
vs.StatisticsEnable = true; vs.StatisticsEnable = true;
vs.VSFunctionEnable = true; vs.VSFunctionEnable = true;
} }
@@ -152,7 +154,7 @@ genX(graphics_pipeline_create)(
gs.DispatchGRFStartRegisterforURBData = gs.DispatchGRFStartRegisterforURBData =
gs_prog_data->base.base.dispatch_grf_start_reg; gs_prog_data->base.base.dispatch_grf_start_reg;
gs.MaximumNumberofThreads = device->info.max_gs_threads - 1; gs.MaximumNumberofThreads = physical_device->max_gs_threads - 1;
/* This is in the next dword on HSW. */ /* This is in the next dword on HSW. */
gs.ControlDataFormat = gs_prog_data->control_data_format; gs.ControlDataFormat = gs_prog_data->control_data_format;
gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords; gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
@@ -185,7 +187,7 @@ genX(graphics_pipeline_create)(
* don't at least set the maximum number of threads. * don't at least set the maximum number of threads.
*/ */
anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
ps.MaximumNumberofThreads = device->info.max_wm_threads - 1; ps.MaximumNumberofThreads = physical_device->max_wm_threads - 1;
} }
} else { } else {
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
@@ -207,7 +209,7 @@ genX(graphics_pipeline_create)(
.offset = 0, .offset = 0,
}; };
ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base); ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base);
ps.MaximumNumberofThreads = device->info.max_wm_threads - 1; ps.MaximumNumberofThreads = physical_device->max_wm_threads - 1;
ps.PushConstantEnable = wm_prog_data->base.nr_params > 0; ps.PushConstantEnable = wm_prog_data->base.nr_params > 0;
ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;

View File

@@ -55,6 +55,8 @@ genX(graphics_pipeline_create)(
{ {
ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass); ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass]; struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
struct anv_pipeline *pipeline; struct anv_pipeline *pipeline;
VkResult result; VkResult result;
@@ -142,7 +144,7 @@ genX(graphics_pipeline_create)(
gs.DispatchGRFStartRegisterForURBData = gs.DispatchGRFStartRegisterForURBData =
gs_prog_data->base.base.dispatch_grf_start_reg; gs_prog_data->base.base.dispatch_grf_start_reg;
gs.MaximumNumberofThreads = device->info.max_gs_threads / 2 - 1; gs.MaximumNumberofThreads = physical_device->max_gs_threads / 2 - 1;
gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords; gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
gs.DispatchMode = gs_prog_data->base.dispatch_mode; gs.DispatchMode = gs_prog_data->base.dispatch_mode;
gs.StatisticsEnable = true; gs.StatisticsEnable = true;
@@ -213,7 +215,7 @@ genX(graphics_pipeline_create)(
vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length; vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
vs.VertexURBEntryReadOffset = 0; vs.VertexURBEntryReadOffset = 0;
vs.MaximumNumberofThreads = device->info.max_vs_threads - 1; vs.MaximumNumberofThreads = physical_device->max_vs_threads - 1;
vs.StatisticsEnable = false; vs.StatisticsEnable = false;
vs.SIMD8DispatchEnable = pipeline->vs_simd8 != NO_KERNEL; vs.SIMD8DispatchEnable = pipeline->vs_simd8 != NO_KERNEL;
vs.VertexCacheDisable = false; vs.VertexCacheDisable = false;

View File

@@ -35,6 +35,8 @@ genX(compute_pipeline_create)(
VkPipeline* pPipeline) VkPipeline* pPipeline)
{ {
ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
struct anv_pipeline *pipeline; struct anv_pipeline *pipeline;
VkResult result; VkResult result;
@@ -115,7 +117,7 @@ genX(compute_pipeline_create)(
#else #else
vfe.GPGPUMode = true; vfe.GPGPUMode = true;
#endif #endif
vfe.MaximumNumberofThreads = device->info.max_cs_threads - 1; vfe.MaximumNumberofThreads = physical_device->max_cs_threads - 1;
vfe.NumberofURBEntries = GEN_GEN <= 7 ? 0 : 2; vfe.NumberofURBEntries = GEN_GEN <= 7 ? 0 : 2;
vfe.ResetGatewayTimer = true; vfe.ResetGatewayTimer = true;
#if GEN_GEN <= 8 #if GEN_GEN <= 8